# cython: infer_types=True
|
|
# cython: cdivision=True
|
|
|
|
cimport cython
|
|
from libc.stdint cimport int32_t
|
|
from libc.string cimport memset, memcpy
|
|
from cymem.cymem cimport Pool
|
|
|
|
|
|
from .typedefs cimport weight_t
|
|
|
|
include "compile_time_constants.pxi"
|
|
|
|
IF USE_BLAS:
|
|
from blis cimport cy as blis
|
|
|
|
cdef extern from "math.h" nogil:
    # C math routines, declared GIL-free so that nogil code can call them.
    # They are declared here in terms of weight_t, so Cython treats both the
    # argument and the result as weight_t at every call site.
    weight_t sqrt(weight_t value)
    weight_t exp(weight_t value)
|
|
|
|
|
|
cdef class Matrix:
    # Attribute layout of the Matrix extension type.
    #
    # mem     -- cymem Pool, readable (not writable) from Python.
    # data    -- raw weight_t buffer; C-level only, not exposed to Python.
    #            Presumably allocated from `mem` with nr_row * nr_col
    #            entries -- confirm against the implementation file.
    # nr_row  -- row count, readable from Python.
    # nr_col  -- column count, readable from Python.
    cdef readonly Pool mem
    cdef weight_t* data
    cdef readonly int32_t nr_row, nr_col
|
|
|
|
|
|
cdef class Vec:
    # Namespace of static, GIL-free helpers that operate on one C array of
    # weight_t plus (optionally) a scalar.
    #
    # Conventions used throughout:
    #   * `nr` / `n_classes` is the number of elements in the array.
    #   * Functions ending in `_i` modify their first argument in place.
    #   * The non-`_i` variants memcpy the input into `output` and then call
    #     the in-place variant, so `output` must hold `nr` elements and must
    #     not partially overlap the input (memcpy requirement).

    @staticmethod
    cdef inline int arg_max(const weight_t* scores, const int n_classes) nogil:
        # Return the index of the largest score.
        #
        # NOTE: tie-breaking differs between the two paths. With exactly two
        # classes a tie yields index 1; the general loop keeps the first
        # maximum it saw. This is preserved because callers may rely on it.
        # `scores[0]` is read unconditionally, so n_classes must be >= 1.
        if n_classes == 2:
            return 0 if scores[0] > scores[1] else 1
        cdef int i
        cdef int best = 0
        cdef weight_t top = scores[0]
        for i in range(1, n_classes):
            if scores[i] > top:
                top = scores[i]
                best = i
        return best

    @staticmethod
    cdef inline weight_t max(const weight_t* x, int32_t nr) nogil:
        # Return the largest element of x, or 0 when there are no elements.
        # A non-positive length is treated as empty so that x[0] is never
        # read from an array that has no first element.
        if nr <= 0:
            return 0
        cdef int i
        cdef weight_t top = x[0]
        for i in range(1, nr):
            if x[i] > top:
                top = x[i]
        return top

    @staticmethod
    cdef inline weight_t sum(const weight_t* vec, int32_t nr) nogil:
        # Return the sum of the first nr elements (0 for an empty array).
        cdef int i
        cdef weight_t total = 0
        for i in range(nr):
            total += vec[i]
        return total

    @staticmethod
    cdef inline weight_t norm(const weight_t* vec, int32_t nr) nogil:
        # Return the Euclidean (L2) norm: sqrt of the sum of squares.
        # The index is declared explicitly, like every sibling helper, rather
        # than relying on the infer_types directive.
        cdef int i
        cdef weight_t total = 0
        for i in range(nr):
            total += vec[i] ** 2
        return sqrt(total)

    @staticmethod
    cdef inline void add(weight_t* output, const weight_t* x,
                         weight_t inc, int32_t nr) nogil:
        # output[i] = x[i] + inc
        memcpy(output, x, sizeof(output[0]) * nr)
        Vec.add_i(output, inc, nr)

    @staticmethod
    cdef inline void add_i(weight_t* vec, weight_t inc, int32_t nr) nogil:
        # vec[i] += inc, in place.
        cdef int i
        for i in range(nr):
            vec[i] += inc

    @staticmethod
    cdef inline void mul(weight_t* output, const weight_t* vec, weight_t scal,
                         int32_t nr) nogil:
        # output[i] = vec[i] * scal
        memcpy(output, vec, sizeof(output[0]) * nr)
        Vec.mul_i(output, scal, nr)

    @staticmethod
    cdef inline void mul_i(weight_t* vec, weight_t scal, int32_t nr) nogil:
        # vec[i] *= scal, in place. Delegates to blis.scalv (unit stride)
        # when the build defines USE_BLAS, otherwise uses a plain loop.
        cdef int i
        IF USE_BLAS:
            blis.scalv(blis.NO_CONJUGATE, nr, scal, vec, 1)
        ELSE:
            for i in range(nr):
                vec[i] *= scal

    @staticmethod
    cdef inline void pow(weight_t* output, const weight_t* vec, weight_t scal,
                         int32_t nr) nogil:
        # output[i] = vec[i] ** scal
        memcpy(output, vec, sizeof(output[0]) * nr)
        Vec.pow_i(output, scal, nr)

    @staticmethod
    cdef inline void pow_i(weight_t* vec, const weight_t scal, int32_t nr) nogil:
        # vec[i] **= scal, in place.
        cdef int i
        for i in range(nr):
            vec[i] **= scal

    @staticmethod
    @cython.cdivision(True)
    cdef inline void div(weight_t* output, const weight_t* vec, weight_t scal,
                         int32_t nr) nogil:
        # output[i] = vec[i] / scal
        memcpy(output, vec, sizeof(output[0]) * nr)
        Vec.div_i(output, scal, nr)

    @staticmethod
    @cython.cdivision(True)
    cdef inline void div_i(weight_t* vec, const weight_t scal, int32_t nr) nogil:
        # vec[i] /= scal, in place. cdivision(True) selects plain C division,
        # so no zero-divisor check is performed.
        cdef int i
        for i in range(nr):
            vec[i] /= scal

    @staticmethod
    cdef inline void exp(weight_t* output, const weight_t* vec, int32_t nr) nogil:
        # output[i] = exp(vec[i])
        memcpy(output, vec, sizeof(output[0]) * nr)
        Vec.exp_i(output, nr)

    @staticmethod
    cdef inline void exp_i(weight_t* vec, int32_t nr) nogil:
        # vec[i] = exp(vec[i]), in place. The unqualified `exp` here is the
        # module-level math.h declaration, not the Vec.exp method.
        cdef int i
        for i in range(nr):
            vec[i] = exp(vec[i])

    @staticmethod
    cdef inline void reciprocal_i(weight_t* vec, int32_t nr) nogil:
        # vec[i] = 1 / vec[i], in place. No check is made for zero entries.
        cdef int i
        for i in range(nr):
            vec[i] = 1.0 / vec[i]
|
|
|
|
|
|
cdef class VecVec:
    # Static, GIL-free helpers combining two C arrays of equal length `nr`.
    # `_i` variants update their first argument in place; the other variants
    # memcpy `x` into `output` first and then reuse the in-place routine.

    @staticmethod
    cdef inline void add(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         weight_t scale,
                         int32_t nr) nogil:
        # output[k] = x[k] + scale * y[k]
        memcpy(output, x, sizeof(output[0]) * nr)
        VecVec.add_i(output, y, scale, nr)

    @staticmethod
    cdef inline void add_i(weight_t* x,
                           const weight_t* y,
                           weight_t scale,
                           int32_t nr) nogil:
        # x[k] += scale * y[k]. Uses blis.axpyv with unit strides when the
        # build defines USE_BLAS; otherwise falls back to an explicit loop.
        cdef int k
        IF USE_BLAS:
            blis.axpyv(blis.NO_CONJUGATE, nr, scale, <weight_t*>y, 1, x, 1)
        ELSE:
            for k in range(nr):
                x[k] += y[k] * scale

    @staticmethod
    cdef inline void batch_add_i(weight_t* x,
                                 const weight_t* y,
                                 weight_t scale,
                                 int32_t nr, int32_t nr_batch) nogil:
        # Accumulate every row of y into the single vector x:
        # y is read as nr_batch consecutive rows of nr elements each.
        cdef int _
        cdef const weight_t* row_ptr = y
        for _ in range(nr_batch):
            VecVec.add_i(x, row_ptr, scale, nr)
            row_ptr += nr

    @staticmethod
    cdef inline void add_pow(weight_t* output,
                             const weight_t* x, const weight_t* y,
                             weight_t power, int32_t nr) nogil:
        # output[k] = x[k] + y[k] ** power
        memcpy(output, x, sizeof(output[0]) * nr)
        VecVec.add_pow_i(output, y, power, nr)

    @staticmethod
    cdef inline void add_pow_i(weight_t* x,
                               const weight_t* y, weight_t power,
                               int32_t nr) nogil:
        # x[k] += y[k] ** power, in place.
        cdef int k
        for k in range(nr):
            x[k] += y[k] ** power

    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* x, const weight_t* y,
                         int32_t nr) nogil:
        # output[k] = x[k] * y[k]
        memcpy(output, x, sizeof(output[0]) * nr)
        VecVec.mul_i(output, y, nr)

    @staticmethod
    cdef inline void mul_i(weight_t* x,
                           const weight_t* y, int32_t nr) nogil:
        # x[k] *= y[k], in place.
        cdef int k
        for k in range(nr):
            x[k] *= y[k]

    @staticmethod
    cdef inline weight_t dot(
            const weight_t* x, const weight_t* y, int32_t nr) nogil:
        # Return the inner product of x and y (0 when nr is 0).
        cdef int k
        cdef weight_t acc = 0
        for k in range(nr):
            acc += x[k] * y[k]
        return acc

    @staticmethod
    cdef inline int arg_max_if_true(
            const weight_t* scores, const int* is_valid,
            const int n_classes) nogil:
        # Index of the highest score among entries whose is_valid flag is
        # non-zero; the first such index wins ties. Returns -1 when no entry
        # is flagged.
        cdef int k
        cdef int winner = -1
        for k in range(n_classes):
            if not is_valid[k]:
                continue
            if winner == -1 or scores[k] > scores[winner]:
                winner = k
        return winner

    @staticmethod
    cdef inline int arg_max_if_zero(
            const weight_t* scores, const weight_t* costs,
            const int n_classes) nogil:
        # Index of the highest score among entries whose cost is exactly 0;
        # the first such index wins ties. Returns -1 when no cost is 0.
        cdef int k
        cdef int winner = -1
        for k in range(n_classes):
            if costs[k] != 0:
                continue
            if winner == -1 or scores[k] > scores[winner]:
                winner = k
        return winner
|
|
|
|
|
|
cdef class Mat:
    # Static, GIL-free column statistics over a row-major matrix stored as a
    # flat array: element (r, c) lives at mat[r * nr_col + c].

    @staticmethod
    cdef inline void mean_row(weight_t* Ex,
                              const weight_t* mat,
                              int32_t nr_row, int32_t nr_col) nogil:
        # Write the mean over rows of each column into Ex (nr_col entries).
        #
        # With no rows the mean is undefined; Ex is left zero-filled instead
        # of being scaled by 1.0 / 0, which would turn every entry into NaN.
        cdef int i
        memset(Ex, 0, sizeof(Ex[0]) * nr_col)
        if nr_row <= 0:
            return
        for i in range(nr_row):
            # Accumulate row i into the running column sums.
            VecVec.add_i(Ex, &mat[i * nr_col], 1.0, nr_col)
        Vec.mul_i(Ex, 1.0 / nr_row, nr_col)

    @staticmethod
    cdef inline void var_row(weight_t* Vx,
                             const weight_t* mat, const weight_t* Ex,
                             int32_t nr_row, int32_t nr_col,
                             weight_t eps) nogil:
        # Write the per-column variance (divisor nr_row) plus eps into Vx.
        # Ex supplies the per-column shift, normally the column means.
        # Vx is left untouched when the matrix has no rows or no columns.
        #
        # Shifted-data formula, from
        # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
        if nr_row == 0 or nr_col == 0:
            return
        cdef int i, j
        cdef weight_t x, sum_, sum2
        for i in range(nr_col):
            sum_ = 0.0
            sum2 = 0.0
            for j in range(nr_row):
                x = mat[j * nr_col + i]
                sum2 += (x - Ex[i]) ** 2
                sum_ += x - Ex[i]
            # The sum_ term corrects for Ex[i] not being the exact mean.
            Vx[i] = (sum2 - sum_**2 / nr_row) / nr_row
            Vx[i] += eps
|
|
|
|
|
|
cdef class MatVec:
    # Static, GIL-free helpers combining a row-major matrix (flat array,
    # element (r, c) at mat[r * nr_col + c]) with a vector.
    #
    # NOTE: dot, T_dot and their batch variants ACCUMULATE into `output`
    # (`+=` in the fallback loops). The BLIS calls pass 1.0 in the scalar
    # slot before `output`, presumably beta, to match. Callers that want a
    # plain product must zero `output` first.

    @staticmethod
    cdef inline void add_i(weight_t* mat,
                           const weight_t* vec, weight_t scale,
                           int32_t nr_row, int32_t nr_col) nogil:
        # Add scale * vec (nr_col entries) to every row of mat, in place.
        cdef int i
        for i in range(nr_row):
            VecVec.add_i(mat + (i * nr_col),
                         vec, scale, nr_col)

    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* mat,
                         const weight_t* vec,
                         int32_t nr_row, int32_t nr_col) nogil:
        # output[r, c] = mat[r, c] * vec[c]; output must not overlap mat.
        memcpy(output, mat, sizeof(output[0]) * nr_row * nr_col)
        MatVec.mul_i(output, vec, nr_row, nr_col)

    @staticmethod
    cdef inline void mul_i(weight_t* mat,
                           const weight_t* vec,
                           int32_t nr_row, int32_t nr_col) nogil:
        # mat[r, c] *= vec[c]: scale each column by the matching vec entry.
        cdef int i, row, col
        for i in range(nr_row):
            row = i * nr_col
            for col in range(nr_col):
                mat[row + col] *= vec[col]

    @staticmethod
    cdef inline void dot(weight_t* output,
                         const weight_t* mat,
                         const weight_t* vec,
                         int32_t nr_row, int32_t nr_col) nogil:
        # output[r] += sum_c mat[r, c] * vec[c]
        # output has nr_row entries, vec has nr_col entries.
        cdef int i, row, col
        IF USE_BLAS:
            blis.gemv(
                blis.NO_TRANSPOSE,
                blis.NO_CONJUGATE,
                nr_row,
                nr_col,
                1.0,
                <weight_t*>mat, nr_col, 1,
                <weight_t*>vec, 1,
                1.0,
                output, 1
            )
        ELSE:
            for i in range(nr_row):
                row = i * nr_col
                for col in range(nr_col):
                    output[i] += mat[row + col] * vec[col]

    @staticmethod
    cdef inline void batch_dot(weight_t* output,
                               const weight_t* mat,
                               const weight_t* vec,
                               int32_t nr_row, int32_t nr_col,
                               int32_t nr_batch) nogil:
        # Apply dot() to each of nr_batch input vectors.
        #
        # output dim: nr_batch * nr_row
        # vec dim:    nr_batch * nr_col
        # mat dim:    nr_row * nr_col
        #
        # In gemm terms the output (M * N) cannot be transposed, so
        #   M = nr_batch, N = nr_row, K = nr_col
        #   vec:   M * K
        #   mat.T: K * N
        #   out:   M * N
        cdef int b
        IF USE_BLAS:
            blis.gemm(
                blis.NO_TRANSPOSE,
                blis.TRANSPOSE,
                nr_batch,
                nr_row,
                nr_col,
                1.0,
                <weight_t*>vec,
                nr_col,
                1,
                <weight_t*>mat,
                nr_col,
                1,
                1.0,
                output,
                nr_row,
                1)
        ELSE:
            for b in range(nr_batch):
                MatVec.dot(output,
                           mat, vec, nr_row, nr_col)
                # Step to the next output row and the next input vector.
                output += nr_row
                vec += nr_col

    @staticmethod
    cdef inline void T_dot(weight_t* output,
                           const weight_t* mat,
                           const weight_t* vec,
                           int32_t nr_row,
                           int32_t nr_col) nogil:
        # output[c] += sum_r vec[r] * mat[r, c]   (i.e. mat.T applied to vec)
        # output has nr_col entries, vec has nr_row entries.
        cdef int row, col
        IF USE_BLAS:
            blis.gemv(
                blis.TRANSPOSE,
                blis.NO_CONJUGATE,
                nr_row, nr_col,
                1.0,
                <weight_t*>mat, nr_col, 1,
                <weight_t*>vec, 1,
                1.0,
                output, 1,
            )
        ELSE:
            for row in range(nr_row):
                for col in range(nr_col):
                    output[col] += vec[row] * mat[(row * nr_col) + col]

    @staticmethod
    cdef inline void batch_T_dot(weight_t* output,
                                 const weight_t* mat,
                                 const weight_t* vec,
                                 int32_t nr_row,
                                 int32_t nr_col,
                                 int32_t nr_batch) nogil:
        # Apply T_dot() to each of nr_batch input vectors.
        #
        # output is (nr_batch, nr_col)
        # mat is    (nr_row, nr_col)
        # vec is    (nr_batch, nr_row)
        #
        # In gemm terms the output is (M, N), so
        #   M = nr_batch, N = nr_col, K = nr_row
        #   vec: M * K
        #   mat: K * N
        #   out: M * N
        cdef int _
        IF USE_BLAS:
            blis.gemm(
                blis.NO_TRANSPOSE,
                blis.NO_TRANSPOSE,
                nr_batch,
                nr_col,
                nr_row,
                1.0,
                <weight_t*>vec,
                nr_row,
                1,
                <weight_t*>mat,
                nr_col,
                1,
                1.0,
                output,
                nr_col,
                1)
        ELSE:
            for _ in range(nr_batch):
                MatVec.T_dot(output,
                             mat, vec, nr_row, nr_col)
                # Step to the next output row and the next input vector.
                output += nr_col
                vec += nr_row
|
|
|
|
|
|
cdef class MatMat:
    # Static, GIL-free helpers over row-major matrices stored as flat arrays
    # (element (r, c) at index r * nr_col + c).

    @staticmethod
    cdef inline void add(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         int32_t nr_row, int32_t nr_col) nogil:
        # output = x + y elementwise; output must not overlap x (memcpy).
        memcpy(output, x, sizeof(output[0]) * nr_row * nr_col)
        MatMat.add_i(output, y, nr_row, nr_col)

    @staticmethod
    cdef inline void add_i(weight_t* x,
                           const weight_t* y,
                           int32_t nr_row, int32_t nr_col) nogil:
        # x += y elementwise, in place.
        cdef int i, row, col
        for i in range(nr_row):
            row = i * nr_col
            for col in range(nr_col):
                x[row + col] += y[row + col]

    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         int32_t nr_row, int32_t nr_col) nogil:
        # output = x * y elementwise (not a matrix product); output must not
        # overlap x (memcpy).
        memcpy(output, x, sizeof(output[0]) * nr_row * nr_col)
        MatMat.mul_i(output, y, nr_row, nr_col)

    @staticmethod
    cdef inline void mul_i(weight_t* x,
                           const weight_t* y,
                           int32_t nr_row, int32_t nr_col) nogil:
        # x *= y elementwise, in place.
        cdef int i, row, col
        for i in range(nr_row):
            row = i * nr_col
            for col in range(nr_col):
                x[row + col] *= y[row + col]

    @staticmethod
    cdef inline void add_outer_i(weight_t* mat,
                                 const weight_t* x,
                                 const weight_t* y,
                                 int32_t nr_row,
                                 int32_t nr_col) nogil:
        # mat[i, j] += x[i] * y[j]: add the outer product of x (nr_row
        # entries) and y (nr_col entries) to mat, in place.
        cdef int i, j, row
        IF USE_BLAS:
            blis.ger(
                blis.NO_CONJUGATE, blis.NO_CONJUGATE,
                nr_row, nr_col,
                1.0,
                <weight_t*>x, 1,
                <weight_t*>y, 1,
                mat, nr_col, 1
            )
        ELSE:
            for i in range(nr_row):
                row = i * nr_col
                for j in range(nr_col):
                    mat[row + j] += x[i] * y[j]

    @staticmethod
    cdef inline void batch_add_outer_i(weight_t* output,
                                       const weight_t* x,
                                       const weight_t* y,
                                       int32_t nr_row,
                                       int32_t nr_col,
                                       int32_t nr_batch) nogil:
        # Add the outer product of every (x row, y row) pair to output.
        #
        # output dim: nr_row * nr_col
        # x dim:      nr_batch * nr_row
        # y dim:      nr_batch * nr_col
        #
        # In gemm terms the output is M * N (it cannot be transposed), so
        #   M = nr_row, N = nr_col, K = nr_batch
        #   x.T: M * K
        #   y:   K * N
        #   out: M * N
        cdef int _, i, j, row
        IF USE_BLAS:
            blis.gemm(
                blis.TRANSPOSE,
                blis.NO_TRANSPOSE,
                nr_row,
                nr_col,
                nr_batch,
                1.0,
                <weight_t*>x,
                nr_row,
                1,
                <weight_t*>y,
                nr_col,
                1,
                1.0,
                output,
                nr_col,
                1)
        ELSE:
            for _ in range(nr_batch):
                for i in range(nr_row):
                    row = i * nr_col
                    for j in range(nr_col):
                        output[row + j] += x[i] * y[j]
                # Step to the next batch item in both inputs.
                x += nr_row
                y += nr_col
|