alpcentaur
/
basabuuka_prototyp


								# cython: infer_types=True

								# cython: cdivision=True


								cimport cython

								from libc.stdint cimport int32_t

								from libc.string cimport memset, memcpy

								from cymem.cymem cimport Pool


								from .typedefs cimport weight_t


								include "compile_time_constants.pxi"


								IF USE_BLAS:

								    from blis cimport cy as blis


								cdef extern from "math.h" nogil:

								    weight_t exp(weight_t x)

								    weight_t sqrt(weight_t x)


								# Extension type that bundles a raw weight_t buffer with its two dimensions.
								cdef class Matrix:

								    # cymem Pool, readable from Python.
								    # NOTE(review): presumably the pool that owns the allocation behind
								    # `data` -- confirm against the implementation.
								    cdef readonly Pool mem

								    # Raw pointer to the weight_t storage. Declared without
								    # readonly/public, so it is only reachable from Cython code.
								    cdef weight_t* data

								    # Read-only int32 attributes.
								    # NOTE(review): presumably the row and column counts of `data` -- confirm.
								    cdef readonly int32_t nr_row

								    cdef readonly int32_t nr_col


								cdef class Vec:
								    # Namespace of nogil static kernels that operate on one raw weight_t
								    # buffer. `nr` / `n_classes` is the number of elements a kernel walks.
								    # Kernels whose name ends in `_i` overwrite their first argument in
								    # place; the unsuffixed twins memcpy the input into `output` and then
								    # run the in-place kernel on that copy.

								    @staticmethod
								    cdef inline int arg_max(const weight_t* scores, const int n_classes) nogil:
								        # Return the index of the largest entry of scores[0:n_classes].
								        #
								        # NOTE: the two paths break ties differently. The two-class shortcut
								        # returns 1 when both scores are equal, whereas the general scan
								        # (strict `>`) keeps the lowest index among equal maxima.
								        # The general path reads scores[0] unconditionally, so the buffer
								        # must hold at least one element.
								        cdef int idx
								        cdef int winner = 0
								        cdef weight_t top
								        if n_classes == 2:
								            if scores[0] > scores[1]:
								                return 0
								            return 1
								        top = scores[0]
								        for idx in range(1, n_classes):
								            if scores[idx] > top:
								                top = scores[idx]
								                winner = idx
								        return winner

								    @staticmethod
								    cdef inline weight_t max(const weight_t* x, int32_t nr) nogil:
								        # Return the largest value in x[0:nr]; returns 0 when nr == 0.
								        cdef int idx
								        cdef weight_t top
								        if nr == 0:
								            return 0
								        top = x[0]
								        for idx in range(1, nr):
								            if x[idx] > top:
								                top = x[idx]
								        return top

								    @staticmethod
								    cdef inline weight_t sum(const weight_t* vec, int32_t nr) nogil:
								        # Return the sum of vec[0:nr] (0 for an empty range).
								        cdef int idx
								        cdef weight_t acc = 0
								        for idx in range(nr):
								            acc += vec[idx]
								        return acc

								    @staticmethod
								    cdef inline weight_t norm(const weight_t* vec, int32_t nr) nogil:
								        # Return the Euclidean (L2) norm of vec[0:nr]: sqrt of the sum of squares.
								        # The loop index is declared explicitly, like in every sibling
								        # kernel, instead of being left to type inference.
								        cdef int idx
								        cdef weight_t sum_sq = 0
								        for idx in range(nr):
								            sum_sq += vec[idx] ** 2
								        return sqrt(sum_sq)

								    @staticmethod
								    cdef inline void add(weight_t* output, const weight_t* x,
								            weight_t inc, int32_t nr) nogil:
								        # output[k] = x[k] + inc for k in [0, nr).
								        memcpy(output, x, sizeof(output[0]) * nr)
								        Vec.add_i(output, inc, nr)

								    @staticmethod
								    cdef inline void add_i(weight_t* vec, weight_t inc, int32_t nr) nogil:
								        # In place: vec[k] += inc for k in [0, nr).
								        cdef int idx
								        for idx in range(nr):
								            vec[idx] += inc

								    @staticmethod
								    cdef inline void mul(weight_t* output, const weight_t* vec, weight_t scal,
								            int32_t nr) nogil:
								        # output[k] = vec[k] * scal for k in [0, nr).
								        memcpy(output, vec, sizeof(output[0]) * nr)
								        Vec.mul_i(output, scal, nr)

								    @staticmethod
								    cdef inline void mul_i(weight_t* vec, weight_t scal, int32_t nr) nogil:
								        # In place: vec[k] *= scal for k in [0, nr). When USE_BLAS is set at
								        # compile time the scaling is delegated to blis.scalv with stride 1.
								        cdef int idx
								        IF USE_BLAS:
								            blis.scalv(blis.NO_CONJUGATE, nr, scal, vec, 1)
								        ELSE:
								            for idx in range(nr):
								                vec[idx] *= scal

								    @staticmethod
								    cdef inline void pow(weight_t* output, const weight_t* vec, weight_t scal,
								            int32_t nr) nogil:
								        # output[k] = vec[k] ** scal for k in [0, nr).
								        memcpy(output, vec, sizeof(output[0]) * nr)
								        Vec.pow_i(output, scal, nr)

								    @staticmethod
								    cdef inline void pow_i(weight_t* vec, const weight_t scal, int32_t nr) nogil:
								        # In place: vec[k] **= scal for k in [0, nr).
								        cdef int idx
								        for idx in range(nr):
								            vec[idx] **= scal

								    @staticmethod
								    @cython.cdivision(True)
								    cdef inline void div(weight_t* output, const weight_t* vec, weight_t scal,
								            int32_t nr) nogil:
								        # output[k] = vec[k] / scal for k in [0, nr).
								        memcpy(output, vec, sizeof(output[0]) * nr)
								        Vec.div_i(output, scal, nr)

								    @staticmethod
								    @cython.cdivision(True)
								    cdef inline void div_i(weight_t* vec, const weight_t scal, int32_t nr) nogil:
								        # In place: vec[k] /= scal for k in [0, nr). C division semantics are
								        # requested via the decorator, so scal == 0 is not checked here.
								        cdef int idx
								        for idx in range(nr):
								            vec[idx] /= scal

								    @staticmethod
								    cdef inline void exp(weight_t* output, const weight_t* vec, int32_t nr) nogil:
								        # output[k] = exp(vec[k]) for k in [0, nr).
								        memcpy(output, vec, sizeof(output[0]) * nr)
								        Vec.exp_i(output, nr)

								    @staticmethod
								    cdef inline void exp_i(weight_t* vec, int32_t nr) nogil:
								        # In place: vec[k] = exp(vec[k]) for k in [0, nr), using the math.h
								        # exp declared at module level.
								        cdef int idx
								        for idx in range(nr):
								            vec[idx] = exp(vec[idx])

								    @staticmethod
								    cdef inline void reciprocal_i(weight_t* vec, int32_t nr) nogil:
								        # In place: vec[k] = 1 / vec[k] for k in [0, nr). Zero entries are
								        # not special-cased.
								        cdef int idx
								        for idx in range(nr):
								            vec[idx] = 1.0 / vec[idx]


								cdef class VecVec:
								    # Namespace of nogil static kernels that combine two raw weight_t
								    # buffers of `nr` elements each. `_i` kernels update their first
								    # argument in place; the unsuffixed twins memcpy `x` into `output`
								    # first and then run the in-place kernel on that copy.

								    @staticmethod
								    cdef inline void add(weight_t* output,
								                         const weight_t* x,
								                         const weight_t* y,
								                         weight_t scale,
								                         int32_t nr) nogil:
								        # output[k] = x[k] + scale * y[k] for k in [0, nr).
								        memcpy(output, x, sizeof(output[0]) * nr)
								        VecVec.add_i(output, y, scale, nr)

								    @staticmethod
								    cdef inline void add_i(weight_t* x,
								                           const weight_t* y,
								                           weight_t scale,
								                           int32_t nr) nogil:
								        # In place: x[k] += scale * y[k] for k in [0, nr). When USE_BLAS is
								        # set at compile time this is delegated to blis.axpyv with unit
								        # strides (y is cast to drop its const qualifier for the call).
								        cdef int idx
								        IF USE_BLAS:
								            blis.axpyv(blis.NO_CONJUGATE, nr, scale, <weight_t*>y, 1, x, 1)
								        ELSE:
								            for idx in range(nr):
								                x[idx] += y[idx] * scale

								    @staticmethod
								    cdef inline void batch_add_i(weight_t* x,
								                           const weight_t* y,
								                           weight_t scale,
								                           int32_t nr, int32_t nr_batch) nogil:
								        # For fixed x, matrix of y: accumulates scale * row into x for each
								        # of the nr_batch consecutive rows of length nr stored in y.
								        cdef int _
								        for _ in range(nr_batch):
								            VecVec.add_i(x, y, scale, nr)
								            # Advance the local copy of the pointer to the next row.
								            y += nr

								    @staticmethod
								    cdef inline void add_pow(weight_t* output,
								            const weight_t* x, const weight_t* y, weight_t power, int32_t nr) nogil:
								        # output[k] = x[k] + y[k] ** power for k in [0, nr).
								        memcpy(output, x, sizeof(output[0]) * nr)
								        VecVec.add_pow_i(output, y, power, nr)

								    @staticmethod
								    cdef inline void add_pow_i(weight_t* x,
								            const weight_t* y, weight_t power, int32_t nr) nogil:
								        # In place: x[k] += y[k] ** power for k in [0, nr).
								        cdef int idx
								        for idx in range(nr):
								            x[idx] += y[idx] ** power

								    @staticmethod
								    cdef inline void mul(weight_t* output,
								            const weight_t* x, const weight_t* y, int32_t nr) nogil:
								        # output[k] = x[k] * y[k] for k in [0, nr).
								        memcpy(output, x, sizeof(output[0]) * nr)
								        VecVec.mul_i(output, y, nr)

								    @staticmethod
								    cdef inline void mul_i(weight_t* x,
								            const weight_t* y, int32_t nr) nogil:
								        # In place element-wise product: x[k] *= y[k] for k in [0, nr).
								        cdef int idx
								        for idx in range(nr):
								            x[idx] *= y[idx]

								    @staticmethod
								    cdef inline weight_t dot(
								            const weight_t* x, const weight_t* y, int32_t nr) nogil:
								        # Return the inner product sum(x[k] * y[k]) over k in [0, nr).
								        cdef int idx
								        cdef weight_t acc = 0
								        for idx in range(nr):
								            acc += x[idx] * y[idx]
								        return acc

								    @staticmethod
								    cdef inline int arg_max_if_true(
								            const weight_t* scores, const int* is_valid, const int n_classes) nogil:
								        # Return the index of the highest score among entries whose
								        # is_valid flag is non-zero, or -1 when no entry is flagged.
								        # Ties keep the lowest index (strict `>`).
								        cdef int idx
								        cdef int winner = -1
								        for idx in range(n_classes):
								            if is_valid[idx] and (winner == -1 or scores[idx] > scores[winner]):
								                winner = idx
								        return winner

								    @staticmethod
								    cdef inline int arg_max_if_zero(
								            const weight_t* scores, const weight_t* costs, const int n_classes) nogil:
								        # Return the index of the highest score among entries whose cost
								        # is exactly 0, or -1 when no entry has zero cost.
								        # Ties keep the lowest index (strict `>`).
								        cdef int idx
								        cdef int winner = -1
								        for idx in range(n_classes):
								            if costs[idx] == 0 and (winner == -1 or scores[idx] > scores[winner]):
								                winner = idx
								        return winner


								cdef class Mat:
								    # Namespace of nogil static kernels computing per-column statistics
								    # of a row-major matrix stored as a flat weight_t buffer
								    # (element (r, c) lives at mat[r * nr_col + c]).

								    @staticmethod
								    cdef inline void mean_row(weight_t* Ex,
								            const weight_t* mat, int32_t nr_row, int32_t nr_col) nogil:
								        # Write the mean over rows of each column into Ex[0:nr_col].
								        # Ex is always zeroed first. With nr_row == 0 it is left at zero:
								        # scaling by 1.0 / nr_row would otherwise multiply the zeros by an
								        # infinite factor and fill Ex with NaN.
								        cdef int r
								        memset(Ex, 0, sizeof(Ex[0]) * nr_col)
								        if nr_row == 0:
								            return
								        for r in range(nr_row):
								            VecVec.add_i(Ex, &mat[r * nr_col], 1.0, nr_col)
								        Vec.mul_i(Ex, 1.0 / nr_row, nr_col)

								    @staticmethod
								    cdef inline void var_row(weight_t* Vx,
								            const weight_t* mat, const weight_t* Ex,
								            int32_t nr_row, int32_t nr_col, weight_t eps) nogil:
								        # Write the variance over rows of each column, plus eps, into
								        # Vx[0:nr_col]. Ex supplies the per-column value the deviations
								        # are measured from. Vx is left untouched when either dimension
								        # is 0.
								        #
								        # From https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
								        # Both the sum of deviations and the sum of squared deviations are
								        # accumulated, and the squared first sum is subtracted as a
								        # correction term.
								        cdef int c, r
								        cdef weight_t x
								        cdef weight_t sum_, sum2
								        if nr_row == 0 or nr_col == 0:
								            return
								        for c in range(nr_col):
								            sum_ = 0.0
								            sum2 = 0.0
								            for r in range(nr_row):
								                x = mat[r * nr_col + c]
								                sum2 += (x - Ex[c]) ** 2
								                sum_ += x - Ex[c]
								            Vx[c] = (sum2 - sum_**2 / nr_row) / nr_row
								            Vx[c] += eps


								cdef class MatVec:
								    # Namespace of nogil static kernels combining a row-major matrix
								    # (flat weight_t buffer, element (r, c) at mat[r * nr_col + c]) with a
								    # vector or a batch of vectors.
								    #
								    # NOTE: the dot-product kernels ACCUMULATE into `output` (the fallback
								    # loops use `+=`); callers that want a plain product must zero
								    # `output` beforehand.

								    @staticmethod
								    cdef inline void add_i(weight_t* mat,
								            const weight_t* vec, weight_t scale, int32_t nr_row, int32_t nr_col) nogil:
								        # In place: add scale * vec (length nr_col) to every row of mat.
								        cdef int r
								        for r in range(nr_row):
								            VecVec.add_i(mat + (r * nr_col), vec, scale, nr_col)

								    @staticmethod
								    cdef inline void mul(weight_t* output,
								                         const weight_t* mat,
								                         const weight_t* vec,
								                         int32_t nr_row, int32_t nr_col) nogil:
								        # output = mat with every row multiplied element-wise by vec.
								        memcpy(output, mat, sizeof(output[0]) * nr_row * nr_col)
								        MatVec.mul_i(output, vec, nr_row, nr_col)

								    @staticmethod
								    cdef inline void mul_i(weight_t* mat,
								                           const weight_t* vec,
								                           int32_t nr_row, int32_t nr_col) nogil:
								        # In place: mat[r, c] *= vec[c] for every row r and column c.
								        cdef int r, offset, c
								        for r in range(nr_row):
								            offset = r * nr_col
								            for c in range(nr_col):
								                mat[offset + c] *= vec[c]

								    @staticmethod
								    cdef inline void dot(weight_t* output,
								                         const weight_t* mat,
								                         const weight_t* vec,
								                         int32_t nr_row, int32_t nr_col) nogil:
								        # output[r] += sum_c mat[r, c] * vec[c] for r in [0, nr_row).
								        # vec has nr_col elements, output has nr_row elements.
								        # NOTE(review): in the BLAS path the 1.0 passed just before
								        # `output` is presumably gemv's beta (i.e. accumulate, matching
								        # the fallback loop) -- confirm against the blis signature.
								        cdef int i, row, col
								        IF USE_BLAS:
								            blis.gemv(
								                blis.NO_TRANSPOSE,
								                blis.NO_CONJUGATE,
								                nr_row,
								                nr_col,
								                1.0,
								                <weight_t*>mat, nr_col, 1,
								                <weight_t*>vec, 1,
								                1.0,
								                output, 1
								            )
								        ELSE:
								            for i in range(nr_row):
								                row = i * nr_col
								                for col in range(nr_col):
								                    output[i] += mat[row + col] * vec[col]

								    @staticmethod
								    cdef inline void batch_dot(weight_t* output,
								                         const weight_t* mat,
								                         const weight_t* vec,
								                         int32_t nr_row, int32_t nr_col, int32_t nr_batch) nogil:
								        # Batched MatVec.dot: each of the nr_batch input rows of `vec` is
								        # multiplied by mat and accumulated into the matching output row.
								        #
								        # Output dim: batch_size * nr_row
								        # vec dim:    batch_size * nr_col
								        # mat dim:    nr_row     * nr_col
								        # batch_size must be M, because can't transpose C
								        # so nr_row must be N
								        # so nr_col must be K
								        #
								        # vec:   M * K
								        # mat.T: K * N
								        # out:   M * N
								        cdef int _
								        IF USE_BLAS:
								            blis.gemm(
								                blis.NO_TRANSPOSE,
								                blis.TRANSPOSE,
								                nr_batch,
								                nr_row,
								                nr_col,
								                1.0,
								                <weight_t*>vec,
								                nr_col,
								                1,
								                <weight_t*>mat,
								                nr_col,
								                1,
								                1.0,
								                output,
								                nr_row,
								                1)
								        ELSE:
								            for _ in range(nr_batch):
								                MatVec.dot(output, mat, vec, nr_row, nr_col)
								                # Step the local pointers to the next output/input row.
								                output += nr_row
								                vec += nr_col

								    @staticmethod
								    cdef inline void T_dot(weight_t* output,
								                             const weight_t* mat,
								                             const weight_t* vec,
								                             int32_t nr_row,
								                             int32_t nr_col) nogil:
								        # Transposed product:
								        # output[c] += sum_r vec[r] * mat[r, c] for c in [0, nr_col).
								        # vec has nr_row elements, output has nr_col elements.
								        cdef int row, col
								        IF USE_BLAS:
								            blis.gemv(
								                blis.TRANSPOSE,
								                blis.NO_CONJUGATE,
								                nr_row, nr_col,
								                1.0,
								                <weight_t*>mat, nr_col, 1,
								                <weight_t*>vec, 1,
								                1.0,
								                output, 1,
								            )
								        ELSE:
								            for row in range(nr_row):
								                for col in range(nr_col):
								                    output[col] += vec[row] * mat[(row * nr_col) + col]

								    @staticmethod
								    cdef inline void batch_T_dot(weight_t* output,
								                             const weight_t* mat,
								                             const weight_t* vec,
								                             int32_t nr_row,
								                             int32_t nr_col,
								                             int32_t nr_batch) nogil:
								        # Batched MatVec.T_dot: each of the nr_batch input rows of `vec`
								        # (length nr_row) is multiplied by mat transposed and accumulated
								        # into the matching output row (length nr_col).
								        cdef int _
								        IF USE_BLAS:
								            # output is (nr_batch, nr_col)
								            # mat is (nr_row, nr_col)
								            # vec is (nr_batch, nr_row)
								            # Output defined as (M, N)
								            # So
								            # nr_batch = M
								            # nr_col = N
								            # nr_row = K
								            #
								            # vec:  M * K
								            # mat:  K * N
								            # out:  M * N
								            blis.gemm(
								                blis.NO_TRANSPOSE,
								                blis.NO_TRANSPOSE,
								                nr_batch,
								                nr_col,
								                nr_row,
								                1.0,
								                <weight_t*>vec,
								                nr_row,
								                1,
								                <weight_t*>mat,
								                nr_col,
								                1,
								                1.0,
								                output,
								                nr_col,
								                1)
								        ELSE:
								            for _ in range(nr_batch):
								                MatVec.T_dot(output, mat, vec, nr_row, nr_col)
								                # Step the local pointers to the next output/input row.
								                output += nr_col
								                vec += nr_row


								cdef class MatMat:
								    # Namespace of nogil static kernels over row-major matrices stored as
								    # flat weight_t buffers (element (r, c) at buf[r * nr_col + c]).
								    # `_i` kernels update their first argument in place; the unsuffixed
								    # twins memcpy `x` into `output` first and then run the in-place
								    # kernel on that copy.

								    @staticmethod
								    cdef inline void add(weight_t* output,
								                         const weight_t* x,
								                         const weight_t* y,
								                         int32_t nr_row, int32_t nr_col) nogil:
								        # output = x + y, element-wise over nr_row * nr_col entries.
								        memcpy(output, x, sizeof(output[0]) * nr_row * nr_col)
								        MatMat.add_i(output, y, nr_row, nr_col)

								    @staticmethod
								    cdef inline void add_i(weight_t* x,
								                           const weight_t* y,
								                           int32_t nr_row, int32_t nr_col) nogil:
								        # In place: x[r, c] += y[r, c] for every row r and column c.
								        cdef int r, offset, c
								        for r in range(nr_row):
								            offset = r * nr_col
								            for c in range(nr_col):
								                x[offset + c] += y[offset + c]

								    @staticmethod
								    cdef inline void mul(weight_t* output,
								                         const weight_t* x,
								                         const weight_t* y,
								                         int32_t nr_row, int32_t nr_col) nogil:
								        # output = x * y, element-wise over nr_row * nr_col entries.
								        memcpy(output, x, sizeof(output[0]) * nr_row * nr_col)
								        MatMat.mul_i(output, y, nr_row, nr_col)

								    @staticmethod
								    cdef inline void mul_i(weight_t* x,
								                           const weight_t* y,
								                           int32_t nr_row, int32_t nr_col) nogil:
								        # In place: x[r, c] *= y[r, c] for every row r and column c.
								        cdef int r, offset, c
								        for r in range(nr_row):
								            offset = r * nr_col
								            for c in range(nr_col):
								                x[offset + c] *= y[offset + c]

								    @staticmethod
								    cdef inline void add_outer_i(weight_t* mat,
								                                 const weight_t* x,
								                                 const weight_t* y,
								                                 int32_t nr_row,
								                                 int32_t nr_col) nogil:
								        # In place rank-1 update: mat[i, j] += x[i] * y[j], where x has
								        # nr_row elements and y has nr_col elements. Delegated to blis.ger
								        # when USE_BLAS is set at compile time.
								        cdef int i, j, row
								        IF USE_BLAS:
								            blis.ger(
								                blis.NO_CONJUGATE, blis.NO_CONJUGATE,
								                nr_row, nr_col,
								                1.0,
								                <weight_t*>x, 1,
								                <weight_t*>y, 1,
								                mat, nr_col, 1
								            )
								        ELSE:
								            for i in range(nr_row):
								                row = i * nr_col
								                for j in range(nr_col):
								                    mat[row + j] += x[i] * y[j]

								    @staticmethod
								    cdef inline void batch_add_outer_i(weight_t* output,
								                                 const weight_t* x,
								                                 const weight_t* y,
								                                 int32_t nr_row,
								                                 int32_t nr_col,
								                                 int32_t nr_batch) nogil:
								        # Accumulate into `output` the outer product of every (x row,
								        # y row) pair of the batch:
								        # output[i, j] += x[b, i] * y[b, j] for b in [0, nr_batch).
								        #
								        # Output dim: nr_row * nr_col
								        # x dim:    batch_size * nr_row
								        # y dim:    batch_size * nr_col
								        #
								        # Output is M*N (can't transpose)
								        # nr_row = M
								        # nr_col = N
								        # batch_size = K
								        #
								        # x.T:  M * K
								        # y:    K * N
								        # out:  M * N
								        #
								        # The fallback loop indices are declared explicitly instead of
								        # being left to type inference.
								        cdef int _, i, j, row
								        IF USE_BLAS:
								            blis.gemm(
								                blis.TRANSPOSE,
								                blis.NO_TRANSPOSE,
								                nr_row,
								                nr_col,
								                nr_batch,
								                1.0,
								                <weight_t*>x,
								                nr_row,
								                1,
								                <weight_t*>y,
								                nr_col,
								                1,
								                1.0,
								                output,
								                nr_col,
								                1)
								        ELSE:
								            for _ in range(nr_batch):
								                for i in range(nr_row):
								                    row = i * nr_col
								                    for j in range(nr_col):
								                        output[row + j] += x[i] * y[j]
								                # Step the local pointers to the next batch row.
								                x += nr_row
								                y += nr_col