# cython: infer_types=True
# cython: cdivision=True
cimport cython
from libc.stdint cimport int32_t
from libc.string cimport memset, memcpy
from cymem.cymem cimport Pool

from .typedefs cimport weight_t

include "compile_time_constants.pxi"

# USE_BLAS is a compile-time constant (expected to come from the include
# above); when true, several kernels below dispatch to blis instead of
# running the plain loops.
IF USE_BLAS:
    from blis cimport cy as blis

cdef extern from "math.h" nogil:
    weight_t exp(weight_t x)
    weight_t sqrt(weight_t x)


# Attribute declarations for a matrix: a memory pool, a raw data pointer
# and the row/column counts.
cdef class Matrix:
    cdef readonly Pool mem
    cdef weight_t* data
    cdef readonly int32_t nr_row
    cdef readonly int32_t nr_col


# Kernels over a single vector. Functions with an `_i` suffix modify their
# first argument in place; the un-suffixed variants first copy the input
# into `output` with memcpy and then apply the in-place version.
cdef class Vec:
    @staticmethod
    cdef inline int arg_max(const weight_t* scores, const int n_classes) nogil:
        # Return the index of the largest score.
        # NOTE: scores[0] is read unconditionally, so n_classes must be >= 1.
        # Tie-breaking differs between the two paths: the two-class fast
        # path returns 1 on a tie, the general loop keeps the first maximum.
        cdef int i
        cdef int best = 0
        cdef weight_t mode
        if n_classes == 2:
            return 0 if scores[0] > scores[1] else 1
        mode = scores[0]
        for i in range(1, n_classes):
            if scores[i] > mode:
                mode = scores[i]
                best = i
        return best

    @staticmethod
    cdef inline weight_t max(const weight_t* x, int32_t nr) nogil:
        # Return the largest of the first `nr` values, or 0 when nr == 0.
        cdef int i
        cdef weight_t mode
        if nr == 0:
            return 0
        mode = x[0]
        for i in range(1, nr):
            if x[i] > mode:
                mode = x[i]
        return mode

    @staticmethod
    cdef inline weight_t sum(const weight_t* vec, int32_t nr) nogil:
        # Return the sum of the first `nr` values (0 when nr == 0).
        cdef int i
        cdef weight_t total = 0
        for i in range(nr):
            total += vec[i]
        return total

    @staticmethod
    cdef inline weight_t norm(const weight_t* vec, int32_t nr) nogil:
        # Return the Euclidean (L2) norm: sqrt of the sum of squares.
        cdef int i
        cdef weight_t total = 0
        for i in range(nr):
            total += vec[i] ** 2
        return sqrt(total)

    @staticmethod
    cdef inline void add(weight_t* output, const weight_t* x, weight_t inc,
                         int32_t nr) nogil:
        # output[i] = x[i] + inc
        memcpy(output, x, sizeof(output[0]) * nr)
        Vec.add_i(output, inc, nr)

    @staticmethod
    cdef inline void add_i(weight_t* vec, weight_t inc, int32_t nr) nogil:
        # vec[i] += inc, in place.
        cdef int i
        for i in range(nr):
            vec[i] += inc

    @staticmethod
    cdef inline void mul(weight_t* output, const weight_t* vec, weight_t scal,
                         int32_t nr) nogil:
        # output[i] = vec[i] * scal
        memcpy(output, vec, sizeof(output[0]) * nr)
        Vec.mul_i(output, scal, nr)

    @staticmethod
    cdef inline void mul_i(weight_t* vec, weight_t scal, int32_t nr) nogil:
        # vec[i] *= scal, in place.
        cdef int i
        IF USE_BLAS:
            blis.scalv(blis.NO_CONJUGATE, nr, scal, vec, 1)
        ELSE:
            for i in range(nr):
                vec[i] *= scal

    @staticmethod
    cdef inline void pow(weight_t* output, const weight_t* vec, weight_t scal,
                         int32_t nr) nogil:
        # output[i] = vec[i] ** scal
        memcpy(output, vec, sizeof(output[0]) * nr)
        Vec.pow_i(output, scal, nr)

    @staticmethod
    cdef inline void pow_i(weight_t* vec, const weight_t scal, int32_t nr) nogil:
        # vec[i] **= scal, in place.
        cdef int i
        for i in range(nr):
            vec[i] **= scal

    @staticmethod
    @cython.cdivision(True)
    cdef inline void div(weight_t* output, const weight_t* vec, weight_t scal,
                         int32_t nr) nogil:
        # output[i] = vec[i] / scal (C division: no zero check on scal).
        memcpy(output, vec, sizeof(output[0]) * nr)
        Vec.div_i(output, scal, nr)

    @staticmethod
    @cython.cdivision(True)
    cdef inline void div_i(weight_t* vec, const weight_t scal, int32_t nr) nogil:
        # vec[i] /= scal, in place (C division: no zero check on scal).
        cdef int i
        for i in range(nr):
            vec[i] /= scal

    @staticmethod
    cdef inline void exp(weight_t* output, const weight_t* vec, int32_t nr) nogil:
        # output[i] = exp(vec[i])
        memcpy(output, vec, sizeof(output[0]) * nr)
        Vec.exp_i(output, nr)

    @staticmethod
    cdef inline void exp_i(weight_t* vec, int32_t nr) nogil:
        # vec[i] = exp(vec[i]), in place.
        cdef int i
        for i in range(nr):
            vec[i] = exp(vec[i])

    @staticmethod
    cdef inline void reciprocal_i(weight_t* vec, int32_t nr) nogil:
        # vec[i] = 1 / vec[i], in place. Zero entries are not checked.
        cdef int i
        for i in range(nr):
            vec[i] = 1.0 / vec[i]


# Kernels over a pair of equal-length vectors.
cdef class VecVec:
    @staticmethod
    cdef inline void add(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         weight_t scale,
                         int32_t nr) nogil:
        # output[i] = x[i] + y[i] * scale
        memcpy(output, x, sizeof(output[0]) * nr)
        VecVec.add_i(output, y, scale, nr)

    @staticmethod
    cdef inline void add_i(weight_t* x,
                           const weight_t* y,
                           weight_t scale,
                           int32_t nr) nogil:
        # x[i] += y[i] * scale, in place.
        cdef int i
        IF USE_BLAS:
            blis.axpyv(blis.NO_CONJUGATE, nr, scale, y, 1, x, 1)
        ELSE:
            for i in range(nr):
                x[i] += y[i] * scale

    @staticmethod
    cdef inline void batch_add_i(weight_t* x,
                                 const weight_t* y,
                                 weight_t scale,
                                 int32_t nr, int32_t nr_batch) nogil:
        # For fixed x, matrix of y: each of the nr_batch consecutive
        # length-nr rows of y is scaled and accumulated into the same x.
        cdef int _
        for _ in range(nr_batch):
            VecVec.add_i(x, y, scale, nr)
            y += nr

    @staticmethod
    cdef inline void add_pow(weight_t* output,
                             const weight_t* x, const weight_t* y,
                             weight_t power, int32_t nr) nogil:
        # output[i] = x[i] + y[i] ** power
        memcpy(output, x, sizeof(output[0]) * nr)
        VecVec.add_pow_i(output, y, power, nr)

    @staticmethod
    cdef inline void add_pow_i(weight_t* x,
                               const weight_t* y, weight_t power,
                               int32_t nr) nogil:
        # x[i] += y[i] ** power, in place.
        cdef int i
        for i in range(nr):
            x[i] += y[i] ** power

    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* x, const weight_t* y,
                         int32_t nr) nogil:
        # output[i] = x[i] * y[i]
        memcpy(output, x, sizeof(output[0]) * nr)
        VecVec.mul_i(output, y, nr)

    @staticmethod
    cdef inline void mul_i(weight_t* x,
                           const weight_t* y, int32_t nr) nogil:
        # x[i] *= y[i], in place.
        cdef int i
        for i in range(nr):
            x[i] *= y[i]

    @staticmethod
    cdef inline weight_t dot(
            const weight_t* x, const weight_t* y, int32_t nr) nogil:
        # Return the inner product sum(x[i] * y[i]).
        cdef int i
        cdef weight_t total = 0
        for i in range(nr):
            total += x[i] * y[i]
        return total

    @staticmethod
    cdef inline int arg_max_if_true(
            const weight_t* scores, const int* is_valid,
            const int n_classes) nogil:
        # Return the index of the highest score among entries whose
        # is_valid flag is non-zero, or -1 if no entry is valid.
        # On ties the first such index wins.
        cdef int i
        cdef int best = -1
        for i in range(n_classes):
            if is_valid[i] and (best == -1 or scores[i] > scores[best]):
                best = i
        return best

    @staticmethod
    cdef inline int arg_max_if_zero(
            const weight_t* scores, const weight_t* costs,
            const int n_classes) nogil:
        # Return the index of the highest score among entries whose cost is
        # exactly 0, or -1 if there is none. On ties the first index wins.
        cdef int i
        cdef int best = -1
        for i in range(n_classes):
            if costs[i] == 0 and (best == -1 or scores[i] > scores[best]):
                best = i
        return best


# Per-column statistics of a row-major (nr_row, nr_col) matrix.
cdef class Mat:
    @staticmethod
    cdef inline void mean_row(weight_t* Ex, const weight_t* mat,
                              int32_t nr_row, int32_t nr_col) nogil:
        # Write the mean over rows of each column into Ex (length nr_col).
        cdef int i
        memset(Ex, 0, sizeof(Ex[0]) * nr_col)
        if nr_row == 0:
            # No rows to average. Without this guard the 1.0 / nr_row scale
            # below would be infinite and turn the zeroed Ex into NaNs.
            return
        for i in range(nr_row):
            VecVec.add_i(Ex, &mat[i * nr_col], 1.0, nr_col)
        Vec.mul_i(Ex, 1.0 / nr_row, nr_col)

    @staticmethod
    cdef inline void var_row(weight_t* Vx,
                             const weight_t* mat, const weight_t* Ex,
                             int32_t nr_row, int32_t nr_col,
                             weight_t eps) nogil:
        # Write the variance over rows of each column, plus eps, into Vx.
        # Ex supplies the per-column value the data is shifted by.
        # Vx is left untouched when the matrix has no rows or no columns.
        # From https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
        cdef int i, j
        cdef weight_t x
        cdef weight_t sum_, sum2
        if nr_row == 0 or nr_col == 0:
            return
        for i in range(nr_col):
            sum_ = 0.0
            sum2 = 0.0
            for j in range(nr_row):
                x = mat[j * nr_col + i]
                sum2 += (x - Ex[i]) ** 2
                sum_ += x - Ex[i]
            # Shifted-data formula: the sum_ term corrects for any residual
            # offset between Ex[i] and the actual column mean.
            Vx[i] = (sum2 - sum_**2 / nr_row) / nr_row
            Vx[i] += eps


# Kernels combining a row-major (nr_row, nr_col) matrix with a vector.
cdef class MatVec:
    @staticmethod
    cdef inline void add_i(weight_t* mat,
                           const weight_t* vec, weight_t scale,
                           int32_t nr_row, int32_t nr_col) nogil:
        # Add vec * scale (length nr_col) to every row of mat, in place.
        cdef int i
        for i in range(nr_row):
            VecVec.add_i(mat + (i * nr_col), vec, scale, nr_col)

    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* mat,
                         const weight_t* vec,
                         int32_t nr_row, int32_t nr_col) nogil:
        # output[r, c] = mat[r, c] * vec[c]
        memcpy(output, mat, sizeof(output[0]) * nr_row * nr_col)
        MatVec.mul_i(output, vec, nr_row, nr_col)

    @staticmethod
    cdef inline void mul_i(weight_t* mat,
                           const weight_t* vec,
                           int32_t nr_row, int32_t nr_col) nogil:
        # mat[r, c] *= vec[c], in place.
        cdef int i, row, col
        for i in range(nr_row):
            row = i * nr_col
            for col in range(nr_col):
                mat[row + col] *= vec[col]

    @staticmethod
    cdef inline void dot(weight_t* output,
                         const weight_t* mat,
                         const weight_t* vec,
                         int32_t nr_row, int32_t nr_col) nogil:
        # output[r] += sum_c mat[r, c] * vec[c]
        # The result is accumulated: output is not zeroed first, so callers
        # must initialise it. (In the blis call the second 1.0 is presumably
        # the scale applied to the existing output -- matches the loop.)
        cdef int i, row, col
        IF USE_BLAS:
            blis.gemv(
                blis.NO_TRANSPOSE,
                blis.NO_CONJUGATE,
                nr_row, nr_col,
                1.0,
                mat, nr_col, 1,
                vec, 1,
                1.0,
                output, 1
            )
        ELSE:
            for i in range(nr_row):
                row = i * nr_col
                for col in range(nr_col):
                    output[i] += mat[row + col] * vec[col]

    @staticmethod
    cdef inline void batch_dot(weight_t* output,
                               const weight_t* mat,
                               const weight_t* vec,
                               int32_t nr_row, int32_t nr_col,
                               int32_t nr_batch) nogil:
        # Accumulating matrix-vector product for a batch of vectors.
        # Output dim: batch_size * nr_row
        # vec dim:    batch_size * nr_col
        # mat dim:    nr_row     * nr_col
        # batch_size must be M, because can't transpose C
        # so nr_row must be N
        # so nr_col must be K
        # vec:   M * K
        # mat.T: K * N
        # out:   M * N
        cdef int b
        IF USE_BLAS:
            blis.gemm(
                blis.NO_TRANSPOSE, blis.TRANSPOSE,
                nr_batch, nr_row, nr_col,
                1.0,
                vec, nr_col, 1,
                mat, nr_col, 1,
                1.0,
                output, nr_row, 1)
        ELSE:
            for b in range(nr_batch):
                MatVec.dot(output, mat, vec, nr_row, nr_col)
                output += nr_row
                vec += nr_col

    @staticmethod
    cdef inline void T_dot(weight_t* output,
                           const weight_t* mat,
                           const weight_t* vec,
                           int32_t nr_row, int32_t nr_col) nogil:
        # output[c] += sum_r vec[r] * mat[r, c]
        # i.e. the transposed product; accumulated into output, which is
        # not zeroed first.
        cdef int row, col
        IF USE_BLAS:
            blis.gemv(
                blis.TRANSPOSE, blis.NO_CONJUGATE,
                nr_row, nr_col,
                1.0,
                mat, nr_col, 1,
                vec, 1,
                1.0,
                output, 1,
            )
        ELSE:
            for row in range(nr_row):
                for col in range(nr_col):
                    output[col] += vec[row] * mat[(row * nr_col) + col]

    @staticmethod
    cdef inline void batch_T_dot(weight_t* output,
                                 const weight_t* mat,
                                 const weight_t* vec,
                                 int32_t nr_row, int32_t nr_col,
                                 int32_t nr_batch) nogil:
        # Accumulating transposed matrix-vector product for a batch.
        cdef int _
        IF USE_BLAS:
            # output is (nr_batch, nr_col)
            # mat is (nr_row, nr_col)
            # vec is (nr_batch, nr_row)
            # Output defined as (M, N)
            # So
            # nr_batch = M
            # nr_col = N
            # nr_row = K
            #
            # vec:  M * K
            # mat:  K * N
            # out:  M * N
            blis.gemm(
                blis.NO_TRANSPOSE, blis.NO_TRANSPOSE,
                nr_batch, nr_col, nr_row,
                1.0,
                vec, nr_row, 1,
                mat, nr_col, 1,
                1.0,
                output, nr_col, 1)
        ELSE:
            for _ in range(nr_batch):
                MatVec.T_dot(output, mat, vec, nr_row, nr_col)
                output += nr_col
                vec += nr_row


# Kernels over a pair of row-major (nr_row, nr_col) matrices, plus
# outer-product accumulation.
cdef class MatMat:
    @staticmethod
    cdef inline void add(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         int32_t nr_row, int32_t nr_col) nogil:
        # output[r, c] = x[r, c] + y[r, c]
        memcpy(output, x, sizeof(output[0]) * nr_row * nr_col)
        MatMat.add_i(output, y, nr_row, nr_col)

    @staticmethod
    cdef inline void add_i(weight_t* x,
                           const weight_t* y,
                           int32_t nr_row, int32_t nr_col) nogil:
        # x[r, c] += y[r, c], in place.
        cdef int i, row, col
        for i in range(nr_row):
            row = i * nr_col
            for col in range(nr_col):
                x[row + col] += y[row + col]

    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         int32_t nr_row, int32_t nr_col) nogil:
        # output[r, c] = x[r, c] * y[r, c] (elementwise product).
        memcpy(output, x, sizeof(output[0]) * nr_row * nr_col)
        MatMat.mul_i(output, y, nr_row, nr_col)

    @staticmethod
    cdef inline void mul_i(weight_t* x,
                           const weight_t* y,
                           int32_t nr_row, int32_t nr_col) nogil:
        # x[r, c] *= y[r, c], in place (elementwise product).
        cdef int i, row, col
        for i in range(nr_row):
            row = i * nr_col
            for col in range(nr_col):
                x[row + col] *= y[row + col]

    @staticmethod
    cdef inline void add_outer_i(weight_t* mat,
                                 const weight_t* x,
                                 const weight_t* y,
                                 int32_t nr_row, int32_t nr_col) nogil:
        # mat[r, c] += x[r] * y[c]: accumulate the outer product of
        # x (length nr_row) and y (length nr_col) into mat.
        cdef int i, j, row
        IF USE_BLAS:
            blis.ger(
                blis.NO_CONJUGATE, blis.NO_CONJUGATE,
                nr_row, nr_col,
                1.0,
                x, 1,
                y, 1,
                mat, nr_col, 1
            )
        ELSE:
            for i in range(nr_row):
                row = i * nr_col
                for j in range(nr_col):
                    mat[row + j] += x[i] * y[j]

    @staticmethod
    cdef inline void batch_add_outer_i(weight_t* output,
                                       const weight_t* x,
                                       const weight_t* y,
                                       int32_t nr_row, int32_t nr_col,
                                       int32_t nr_batch) nogil:
        # Accumulate the outer products of nr_batch (x, y) row pairs into
        # the single output matrix.
        # Output dim: nr_row * nr_col
        # x dim:      batch_size * nr_row
        # y dim:      batch_size * nr_col
        #
        # Output is M*N (can't transpose)
        # nr_row = M
        # nr_col = N
        # batch_size = K
        # x.T:  M * K
        # y:    K * N
        # out:  M * N
        cdef int _, i, j, row
        IF USE_BLAS:
            blis.gemm(
                blis.TRANSPOSE, blis.NO_TRANSPOSE,
                nr_row, nr_col, nr_batch,
                1.0,
                x, nr_row, 1,
                y, nr_col, 1,
                1.0,
                output, nr_col, 1)
        ELSE:
            for _ in range(nr_batch):
                for i in range(nr_row):
                    row = i * nr_col
                    for j in range(nr_col):
                        output[row + j] += x[i] * y[j]
                x += nr_row
                y += nr_col