|
|
- # cython: infer_types=True
- # cython: cdivision=True
-
- cimport cython
- from libc.stdint cimport int32_t
- from libc.string cimport memset, memcpy
- from cymem.cymem cimport Pool
-
-
- from .typedefs cimport weight_t
-
- include "compile_time_constants.pxi"
-
- IF USE_BLAS:
- from blis cimport cy as blis
-
- cdef extern from "math.h" nogil:
- weight_t exp(weight_t x)
- weight_t sqrt(weight_t x)
-
-
cdef class Matrix:
    # Attribute layout of the Matrix extension type.
    #
    # `mem` is a cymem Pool and is readable (not writable) from Python.
    # NOTE(review): presumably this pool owns the allocation behind `data`;
    # confirm against the implementation file.
    cdef readonly Pool mem
    # Raw pointer to weight_t values. It is declared neither `readonly` nor
    # `public`, so it is only reachable from C-level code.
    cdef weight_t* data
    # Row and column counts, readable from Python. Declared in the same order
    # as before (nr_row, then nr_col).
    cdef readonly int32_t nr_row, nr_col
-
-
cdef class Vec:
    # Static, inline, nogil helpers operating on a raw weight_t buffer.
    #
    # Naming convention used throughout this class:
    #   * `name_i(vec, ..., nr)` modifies the first `nr` elements in place.
    #   * `name(output, vec, ..., nr)` copies the first `nr` elements of the
    #     input into `output` with memcpy and then applies `name_i` to
    #     `output`. Because memcpy is used, `output` and the input are
    #     expected not to overlap.

    # Return the index of the largest of the first `n_classes` scores.
    #
    # For n_classes <= 0 the function returns 0 without touching `scores`.
    # For the general case the earliest index wins on ties (strict `>`).
    @staticmethod
    cdef inline int arg_max(const weight_t* scores, const int n_classes) nogil:
        cdef int i
        cdef int best = 0
        cdef weight_t mode
        if n_classes <= 0:
            # Empty input: the result was already 0 here, but it used to be
            # produced after an out-of-bounds read of scores[0].
            return 0
        if n_classes == 2:
            # Two-class shortcut. Note that on an exact tie this returns 1,
            # whereas the loop below would keep index 0.
            return 0 if scores[0] > scores[1] else 1
        mode = scores[0]
        for i in range(1, n_classes):
            if scores[i] > mode:
                mode = scores[i]
                best = i
        return best

    # Return the largest of the first `nr` values of `x`, or 0 when nr == 0.
    @staticmethod
    cdef inline weight_t max(const weight_t* x, int32_t nr) nogil:
        cdef int i
        cdef weight_t mode
        if nr == 0:
            return 0
        mode = x[0]
        for i in range(1, nr):
            if x[i] > mode:
                mode = x[i]
        return mode

    # Return the sum of the first `nr` values, accumulated left to right.
    @staticmethod
    cdef inline weight_t sum(const weight_t* vec, int32_t nr) nogil:
        cdef int i
        cdef weight_t total = 0
        for i in range(nr):
            total += vec[i]
        return total

    # Return sqrt(sum(vec[i] ** 2)) over the first `nr` values.
    @staticmethod
    cdef inline weight_t norm(const weight_t* vec, int32_t nr) nogil:
        # The index is declared explicitly, like in the sibling methods,
        # instead of relying on type inference.
        cdef int i
        cdef weight_t total = 0
        for i in range(nr):
            total += vec[i] ** 2
        return sqrt(total)

    # output[i] = x[i] + inc for i in [0, nr).
    @staticmethod
    cdef inline void add(weight_t* output, const weight_t* x,
                         weight_t inc, int32_t nr) nogil:
        memcpy(output, x, sizeof(output[0]) * nr)
        Vec.add_i(output, inc, nr)

    # vec[i] += inc for i in [0, nr).
    @staticmethod
    cdef inline void add_i(weight_t* vec, weight_t inc, int32_t nr) nogil:
        cdef int i
        for i in range(nr):
            vec[i] += inc

    # output[i] = vec[i] * scal for i in [0, nr).
    @staticmethod
    cdef inline void mul(weight_t* output, const weight_t* vec, weight_t scal,
                         int32_t nr) nogil:
        memcpy(output, vec, sizeof(output[0]) * nr)
        Vec.mul_i(output, scal, nr)

    # vec[i] *= scal for i in [0, nr).
    #
    # When the compile-time constant USE_BLAS is set, the work is delegated
    # to blis.scalv; otherwise a plain loop is used.
    @staticmethod
    cdef inline void mul_i(weight_t* vec, weight_t scal, int32_t nr) nogil:
        cdef int i
        IF USE_BLAS:
            blis.scalv(blis.NO_CONJUGATE, nr, scal, vec, 1)
        ELSE:
            for i in range(nr):
                vec[i] *= scal

    # output[i] = vec[i] ** scal for i in [0, nr).
    @staticmethod
    cdef inline void pow(weight_t* output, const weight_t* vec, weight_t scal,
                         int32_t nr) nogil:
        memcpy(output, vec, sizeof(output[0]) * nr)
        Vec.pow_i(output, scal, nr)

    # vec[i] **= scal for i in [0, nr).
    @staticmethod
    cdef inline void pow_i(weight_t* vec, const weight_t scal, int32_t nr) nogil:
        cdef int i
        for i in range(nr):
            vec[i] **= scal

    # output[i] = vec[i] / scal for i in [0, nr). C division semantics are
    # requested explicitly, so a zero divisor is not checked for.
    @staticmethod
    @cython.cdivision(True)
    cdef inline void div(weight_t* output, const weight_t* vec, weight_t scal,
                         int32_t nr) nogil:
        memcpy(output, vec, sizeof(output[0]) * nr)
        Vec.div_i(output, scal, nr)

    # vec[i] /= scal for i in [0, nr), with C division semantics.
    @staticmethod
    @cython.cdivision(True)
    cdef inline void div_i(weight_t* vec, const weight_t scal, int32_t nr) nogil:
        cdef int i
        for i in range(nr):
            vec[i] /= scal

    # output[i] = exp(vec[i]) for i in [0, nr).
    @staticmethod
    cdef inline void exp(weight_t* output, const weight_t* vec, int32_t nr) nogil:
        memcpy(output, vec, sizeof(output[0]) * nr)
        Vec.exp_i(output, nr)

    # vec[i] = exp(vec[i]) for i in [0, nr), using the math.h `exp`
    # declared at module level.
    @staticmethod
    cdef inline void exp_i(weight_t* vec, int32_t nr) nogil:
        cdef int i
        for i in range(nr):
            vec[i] = exp(vec[i])

    # vec[i] = 1.0 / vec[i] for i in [0, nr). Zero elements are not
    # special-cased.
    @staticmethod
    cdef inline void reciprocal_i(weight_t* vec, int32_t nr) nogil:
        cdef int i
        for i in range(nr):
            vec[i] = 1.0 / vec[i]
-
-
cdef class VecVec:
    # Static, inline, nogil helpers that combine two weight_t buffers.
    #
    # `name_i` variants update their first argument in place; the un-suffixed
    # variants memcpy `x` into `output` first and then run the in-place
    # variant on `output`.

    # output[k] = x[k] + y[k] * scale for k in [0, nr).
    @staticmethod
    cdef inline void add(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         weight_t scale,
                         int32_t nr) nogil:
        memcpy(output, x, sizeof(output[0]) * nr)
        VecVec.add_i(output, y, scale, nr)

    # x[k] += y[k] * scale for k in [0, nr).
    #
    # Delegates to blis.axpyv when the compile-time constant USE_BLAS is set.
    @staticmethod
    cdef inline void add_i(weight_t* x,
                           const weight_t* y,
                           weight_t scale,
                           int32_t nr) nogil:
        cdef int k
        IF USE_BLAS:
            blis.axpyv(blis.NO_CONJUGATE, nr, scale, <weight_t*>y, 1, x, 1)
        ELSE:
            for k in range(nr):
                x[k] += y[k] * scale

    # Accumulate `nr_batch` consecutive length-`nr` chunks of `y`, each scaled
    # by `scale`, into the single length-`nr` buffer `x`.
    @staticmethod
    cdef inline void batch_add_i(weight_t* x,
                                 const weight_t* y,
                                 weight_t scale,
                                 int32_t nr, int32_t nr_batch) nogil:
        cdef int batch
        # Walks over y one chunk at a time; x stays fixed.
        cdef const weight_t* chunk = y
        for batch in range(nr_batch):
            VecVec.add_i(x, chunk, scale, nr)
            chunk += nr

    # output[k] = x[k] + y[k] ** power for k in [0, nr).
    @staticmethod
    cdef inline void add_pow(weight_t* output,
                             const weight_t* x,
                             const weight_t* y,
                             weight_t power,
                             int32_t nr) nogil:
        memcpy(output, x, sizeof(output[0]) * nr)
        VecVec.add_pow_i(output, y, power, nr)

    # x[k] += y[k] ** power for k in [0, nr).
    @staticmethod
    cdef inline void add_pow_i(weight_t* x,
                               const weight_t* y,
                               weight_t power,
                               int32_t nr) nogil:
        cdef int k
        for k in range(nr):
            x[k] += y[k] ** power

    # output[k] = x[k] * y[k] for k in [0, nr).
    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         int32_t nr) nogil:
        memcpy(output, x, sizeof(output[0]) * nr)
        VecVec.mul_i(output, y, nr)

    # x[k] *= y[k] for k in [0, nr).
    @staticmethod
    cdef inline void mul_i(weight_t* x,
                           const weight_t* y,
                           int32_t nr) nogil:
        cdef int k
        for k in range(nr):
            x[k] *= y[k]

    # Return sum(x[k] * y[k]) over k in [0, nr), accumulated left to right.
    @staticmethod
    cdef inline weight_t dot(const weight_t* x,
                             const weight_t* y,
                             int32_t nr) nogil:
        cdef int k
        cdef weight_t acc = 0
        for k in range(nr):
            acc += x[k] * y[k]
        return acc

    # Return the index of the highest score among entries whose `is_valid`
    # flag is non-zero, or -1 if no entry is flagged. The earliest index wins
    # on ties.
    @staticmethod
    cdef inline int arg_max_if_true(const weight_t* scores,
                                    const int* is_valid,
                                    const int n_classes) nogil:
        cdef int k
        cdef int winner = -1
        for k in range(n_classes):
            if not is_valid[k]:
                continue
            if winner == -1 or scores[k] > scores[winner]:
                winner = k
        return winner

    # Return the index of the highest score among entries whose cost equals
    # zero, or -1 if there is no such entry. The earliest index wins on ties.
    @staticmethod
    cdef inline int arg_max_if_zero(const weight_t* scores,
                                    const weight_t* costs,
                                    const int n_classes) nogil:
        cdef int k
        cdef int winner = -1
        for k in range(n_classes):
            if not (costs[k] == 0):
                continue
            if winner == -1 or scores[k] > scores[winner]:
                winner = k
        return winner
-
-
cdef class Mat:
    # Column statistics over a matrix stored row by row in a flat weight_t
    # buffer (element (r, c) lives at mat[r * nr_col + c]).

    # Write the per-column mean of `mat` into Ex[0:nr_col].
    #
    # Ex is always zeroed first. When there are no rows the zeros are left in
    # place; previously the 1.0 / nr_row scale factor was computed with a
    # zero divisor in that case, which poisoned the output.
    @staticmethod
    cdef inline void mean_row(weight_t* Ex,
                              const weight_t* mat,
                              int32_t nr_row, int32_t nr_col) nogil:
        cdef int i
        memset(Ex, 0, sizeof(Ex[0]) * nr_col)
        if nr_row <= 0:
            return
        # Sum all rows into Ex, then scale the sums down to means.
        for i in range(nr_row):
            VecVec.add_i(Ex, &mat[i * nr_col], 1.0, nr_col)
        Vec.mul_i(Ex, 1.0 / nr_row, nr_col)

    # Write the per-column variance of `mat`, plus `eps`, into Vx[0:nr_col].
    #
    # `Ex` supplies the per-column shift (the caller is expected to pass the
    # column means). The result divides by nr_row, i.e. no n-1 correction.
    # Vx is left untouched when nr_row == 0 or nr_col == 0.
    @staticmethod
    cdef inline void var_row(weight_t* Vx,
                             const weight_t* mat, const weight_t* Ex,
                             int32_t nr_row, int32_t nr_col, weight_t eps) nogil:
        # From https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
        cdef int i, j
        cdef weight_t x, sum_, sum2
        if nr_row == 0 or nr_col == 0:
            return
        for i in range(nr_col):
            sum_ = 0.0
            sum2 = 0.0
            for j in range(nr_row):
                x = mat[j * nr_col + i]
                sum2 += (x - Ex[i]) ** 2
                sum_ += x - Ex[i]
            # Shifted-data formula: the sum_ term corrects for any residual
            # offset between Ex[i] and the true column mean.
            Vx[i] = (sum2 - sum_**2 / nr_row) / nr_row
            Vx[i] += eps
-
-
cdef class MatVec:
    # Static, inline, nogil helpers combining a row-major matrix
    # (nr_row x nr_col, flat weight_t buffer) with a vector.
    #
    # The dot-product helpers ACCUMULATE into `output` (the fallback loops use
    # `+=`), so callers that want a plain product must zero `output` first.
    # NOTE(review): the BLAS branches pass 1.0 in the slot that presumably is
    # beta, which would give the same accumulate behaviour - confirm against
    # the blis signatures.

    # Add `vec` (length nr_col), scaled by `scale`, to every row of `mat`.
    @staticmethod
    cdef inline void add_i(weight_t* mat,
                           const weight_t* vec,
                           weight_t scale,
                           int32_t nr_row, int32_t nr_col) nogil:
        cdef int r
        cdef weight_t* row_ptr = mat
        for r in range(nr_row):
            VecVec.add_i(row_ptr, vec, scale, nr_col)
            row_ptr += nr_col

    # output = mat with every row multiplied element-wise by `vec`.
    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* mat,
                         const weight_t* vec,
                         int32_t nr_row, int32_t nr_col) nogil:
        memcpy(output, mat, sizeof(output[0]) * nr_row * nr_col)
        MatVec.mul_i(output, vec, nr_row, nr_col)

    # Multiply every row of `mat` element-wise by `vec` (length nr_col),
    # in place.
    @staticmethod
    cdef inline void mul_i(weight_t* mat,
                           const weight_t* vec,
                           int32_t nr_row, int32_t nr_col) nogil:
        cdef int r, c
        cdef weight_t* row_ptr = mat
        for r in range(nr_row):
            for c in range(nr_col):
                row_ptr[c] *= vec[c]
            row_ptr += nr_col

    # output[r] += sum_c mat[r, c] * vec[c] for each of the nr_row rows.
    @staticmethod
    cdef inline void dot(weight_t* output,
                         const weight_t* mat,
                         const weight_t* vec,
                         int32_t nr_row, int32_t nr_col) nogil:
        cdef int r, c, base
        IF USE_BLAS:
            blis.gemv(
                blis.NO_TRANSPOSE, blis.NO_CONJUGATE,
                nr_row, nr_col,
                1.0,
                <weight_t*>mat, nr_col, 1,
                <weight_t*>vec, 1,
                1.0,
                output, 1
            )
        ELSE:
            for r in range(nr_row):
                base = r * nr_col
                for c in range(nr_col):
                    output[r] += mat[base + c] * vec[c]

    # Batched version of `dot`: for each of the nr_batch input vectors
    # (consecutive length-nr_col chunks of `vec`), accumulate mat . v into the
    # matching length-nr_row chunk of `output`.
    @staticmethod
    cdef inline void batch_dot(weight_t* output,
                               const weight_t* mat,
                               const weight_t* vec,
                               int32_t nr_row, int32_t nr_col,
                               int32_t nr_batch) nogil:
        # Output dim: batch_size * nr_row
        # vec dim:    batch_size * nr_col
        # mat dim:    nr_row * nr_col
        # batch_size must be M, because C can't be transposed,
        # so nr_row must be N
        # and nr_col must be K.
        #
        # vec:   M * K
        # mat.T: K * N
        # out:   M * N
        cdef int b
        cdef weight_t* out_chunk = output
        cdef const weight_t* in_chunk = vec
        IF USE_BLAS:
            blis.gemm(
                blis.NO_TRANSPOSE, blis.TRANSPOSE,
                nr_batch, nr_row, nr_col,
                1.0,
                <weight_t*>vec, nr_col, 1,
                <weight_t*>mat, nr_col, 1,
                1.0,
                output, nr_row, 1
            )
        ELSE:
            for b in range(nr_batch):
                MatVec.dot(out_chunk, mat, in_chunk, nr_row, nr_col)
                out_chunk += nr_row
                in_chunk += nr_col

    # output[c] += sum_r vec[r] * mat[r, c] for each of the nr_col columns,
    # i.e. the product with the transposed matrix. `vec` has length nr_row.
    @staticmethod
    cdef inline void T_dot(weight_t* output,
                           const weight_t* mat,
                           const weight_t* vec,
                           int32_t nr_row,
                           int32_t nr_col) nogil:
        cdef int r, c
        cdef const weight_t* row_ptr = mat
        IF USE_BLAS:
            blis.gemv(
                blis.TRANSPOSE, blis.NO_CONJUGATE,
                nr_row, nr_col,
                1.0,
                <weight_t*>mat, nr_col, 1,
                <weight_t*>vec, 1,
                1.0,
                output, 1,
            )
        ELSE:
            for r in range(nr_row):
                for c in range(nr_col):
                    output[c] += vec[r] * row_ptr[c]
                row_ptr += nr_col

    # Batched version of `T_dot`: each consecutive length-nr_row chunk of
    # `vec` is accumulated into the matching length-nr_col chunk of `output`.
    @staticmethod
    cdef inline void batch_T_dot(weight_t* output,
                                 const weight_t* mat,
                                 const weight_t* vec,
                                 int32_t nr_row,
                                 int32_t nr_col,
                                 int32_t nr_batch) nogil:
        cdef int b
        cdef weight_t* out_chunk = output
        cdef const weight_t* in_chunk = vec
        IF USE_BLAS:
            # output is (nr_batch, nr_col)
            # mat is    (nr_row, nr_col)
            # vec is    (nr_batch, nr_row)
            # Output is defined as (M, N), so
            #   nr_batch = M
            #   nr_col   = N
            #   nr_row   = K
            #
            # vec: M * K
            # mat: K * N
            # out: M * N
            blis.gemm(
                blis.NO_TRANSPOSE, blis.NO_TRANSPOSE,
                nr_batch, nr_col, nr_row,
                1.0,
                <weight_t*>vec, nr_row, 1,
                <weight_t*>mat, nr_col, 1,
                1.0,
                output, nr_col, 1
            )
        ELSE:
            for b in range(nr_batch):
                MatVec.T_dot(out_chunk, mat, in_chunk, nr_row, nr_col)
                out_chunk += nr_col
                in_chunk += nr_row
-
-
cdef class MatMat:
    # Static, inline, nogil helpers over row-major matrices stored in flat
    # weight_t buffers (element (r, c) lives at index r * nr_col + c).

    # output = x + y, element-wise, for nr_row x nr_col matrices.
    @staticmethod
    cdef inline void add(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         int32_t nr_row, int32_t nr_col) nogil:
        memcpy(output, x, sizeof(output[0]) * nr_row * nr_col)
        MatMat.add_i(output, y, nr_row, nr_col)

    # x += y, element-wise and in place.
    @staticmethod
    cdef inline void add_i(weight_t* x,
                           const weight_t* y,
                           int32_t nr_row, int32_t nr_col) nogil:
        cdef int r, c
        cdef weight_t* dst_row = x
        cdef const weight_t* src_row = y
        for r in range(nr_row):
            for c in range(nr_col):
                dst_row[c] += src_row[c]
            dst_row += nr_col
            src_row += nr_col

    # output = x * y, element-wise (not a matrix product).
    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         int32_t nr_row, int32_t nr_col) nogil:
        memcpy(output, x, sizeof(output[0]) * nr_row * nr_col)
        MatMat.mul_i(output, y, nr_row, nr_col)

    # x *= y, element-wise and in place.
    @staticmethod
    cdef inline void mul_i(weight_t* x,
                           const weight_t* y,
                           int32_t nr_row, int32_t nr_col) nogil:
        cdef int r, c
        cdef weight_t* dst_row = x
        cdef const weight_t* src_row = y
        for r in range(nr_row):
            for c in range(nr_col):
                dst_row[c] *= src_row[c]
            dst_row += nr_col
            src_row += nr_col

    # mat[r, c] += x[r] * y[c]: add the outer product of `x` (length nr_row)
    # and `y` (length nr_col) to `mat`, in place.
    @staticmethod
    cdef inline void add_outer_i(weight_t* mat,
                                 const weight_t* x,
                                 const weight_t* y,
                                 int32_t nr_row,
                                 int32_t nr_col) nogil:
        cdef int r, c
        cdef weight_t* dst_row = mat
        IF USE_BLAS:
            blis.ger(
                blis.NO_CONJUGATE, blis.NO_CONJUGATE,
                nr_row, nr_col,
                1.0,
                <weight_t*>x, 1,
                <weight_t*>y, 1,
                mat, nr_col, 1
            )
        ELSE:
            for r in range(nr_row):
                for c in range(nr_col):
                    dst_row[c] += x[r] * y[c]
                dst_row += nr_col

    # Add the sum over the batch of outer products x_b (length nr_row) and
    # y_b (length nr_col) to `output` (nr_row x nr_col), in place.
    @staticmethod
    cdef inline void batch_add_outer_i(weight_t* output,
                                       const weight_t* x,
                                       const weight_t* y,
                                       int32_t nr_row,
                                       int32_t nr_col,
                                       int32_t nr_batch) nogil:
        # Output dim: nr_row * nr_col
        # x dim:      batch_size * nr_row
        # y dim:      batch_size * nr_col
        #
        # Output is M*N (can't be transposed), so
        #   nr_row     = M
        #   nr_col     = N
        #   batch_size = K
        #
        # x.T: M * K
        # y:   K * N
        # out: M * N
        cdef int b, r, c, base
        cdef const weight_t* x_chunk = x
        cdef const weight_t* y_chunk = y
        IF USE_BLAS:
            blis.gemm(
                blis.TRANSPOSE, blis.NO_TRANSPOSE,
                nr_row, nr_col, nr_batch,
                1.0,
                <weight_t*>x, nr_row, 1,
                <weight_t*>y, nr_col, 1,
                1.0,
                output, nr_col, 1
            )
        ELSE:
            for b in range(nr_batch):
                for r in range(nr_row):
                    base = r * nr_col
                    for c in range(nr_col):
                        output[base + c] += x_chunk[r] * y_chunk[c]
                # Step to the next pair of batch vectors.
                x_chunk += nr_row
                y_chunk += nr_col
|