You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

530 lines
16 KiB

# cython: infer_types=True
# cython: cdivision=True
cimport cython
from libc.stdint cimport int32_t
from libc.string cimport memset, memcpy
from cymem.cymem cimport Pool
from .typedefs cimport weight_t
# Textually pull in the compile-time constants file; USE_BLAS (tested just
# below) is presumably defined there -- confirm against the .pxi.
include "compile_time_constants.pxi"
# Compile-time switch: the blis bindings are only cimported when USE_BLAS is set.
IF USE_BLAS:
from blis cimport cy as blis
# C math routines from math.h, declared here with weight_t signatures and
# marked nogil so the helpers below can call them without holding the GIL.
cdef extern from "math.h" nogil:
weight_t exp(weight_t x)
weight_t sqrt(weight_t x)
# Extension type that stores a raw weight_t buffer together with its
# row and column counts.
cdef class Matrix:
# cymem Pool; `readonly` makes the attribute readable (not writable) from
# Python. Presumably this pool owns the allocation behind `data` -- confirm
# in the implementation file.
cdef readonly Pool mem
# Raw C pointer to the values; declared without a visibility modifier, so it
# is reachable from Cython/C code only.
cdef weight_t* data
# Number of rows and number of columns, both readable from Python.
cdef readonly int32_t nr_row
cdef readonly int32_t nr_col
cdef class Vec:
    # Static, GIL-free helpers that work on one raw weight_t buffer of an
    # explicitly passed length. Methods whose name ends in `_i` overwrite
    # their first argument; the unsuffixed variants memcpy the input into
    # `output` and then delegate to the matching `_i` method.

    # Index of the largest entry of scores[0:n_classes].
    # scores[0] is always read (and scores[1] when n_classes == 2), so the
    # buffer must hold at least that many values.
    # Tie-breaking depends on the path taken: the two-class shortcut returns 1
    # when both scores are equal, the general scan keeps the earliest maximum.
    @staticmethod
    cdef inline int arg_max(const weight_t* scores, const int n_classes) nogil:
        cdef int idx
        cdef int winner = 0
        cdef weight_t top
        if n_classes == 2:
            if scores[0] > scores[1]:
                return 0
            return 1
        top = scores[0]
        for idx in range(1, n_classes):
            if scores[idx] > top:
                top = scores[idx]
                winner = idx
        return winner

    # Largest value found in x[0:nr]; yields 0 when nr == 0.
    @staticmethod
    cdef inline weight_t max(const weight_t* x, int32_t nr) nogil:
        cdef int idx
        cdef weight_t top
        if nr == 0:
            return 0
        top = x[0]
        for idx in range(1, nr):
            if x[idx] > top:
                top = x[idx]
        return top

    # Sum of vec[0:nr]; an empty range gives 0.
    @staticmethod
    cdef inline weight_t sum(const weight_t* vec, int32_t nr) nogil:
        cdef weight_t acc = 0
        cdef int idx
        for idx in range(nr):
            acc += vec[idx]
        return acc

    # Square root of the sum of the squared entries of vec[0:nr].
    @staticmethod
    cdef inline weight_t norm(const weight_t* vec, int32_t nr) nogil:
        cdef int idx
        cdef weight_t acc = 0
        for idx in range(nr):
            acc += vec[idx] ** 2
        return sqrt(acc)

    # output[k] = x[k] + inc for every k in 0..nr-1.
    @staticmethod
    cdef inline void add(weight_t* output,
                         const weight_t* x,
                         weight_t inc,
                         int32_t nr) nogil:
        memcpy(output, x, nr * sizeof(weight_t))
        Vec.add_i(output, inc, nr)

    # Add the scalar `inc` to each of the first nr entries of vec, in place.
    @staticmethod
    cdef inline void add_i(weight_t* vec, weight_t inc, int32_t nr) nogil:
        cdef int idx
        for idx in range(nr):
            vec[idx] += inc

    # output[k] = vec[k] * scal for every k in 0..nr-1.
    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* vec,
                         weight_t scal,
                         int32_t nr) nogil:
        memcpy(output, vec, nr * sizeof(weight_t))
        Vec.mul_i(output, scal, nr)

    # Scale the first nr entries of vec by `scal`, in place. Uses blis.scalv
    # when compiled with USE_BLAS, otherwise a plain loop.
    @staticmethod
    cdef inline void mul_i(weight_t* vec, weight_t scal, int32_t nr) nogil:
        cdef int idx
        IF USE_BLAS:
            blis.scalv(blis.NO_CONJUGATE, nr, scal, vec, 1)
        ELSE:
            for idx in range(nr):
                vec[idx] *= scal

    # output[k] = vec[k] ** scal for every k in 0..nr-1.
    @staticmethod
    cdef inline void pow(weight_t* output,
                         const weight_t* vec,
                         weight_t scal,
                         int32_t nr) nogil:
        memcpy(output, vec, nr * sizeof(weight_t))
        Vec.pow_i(output, scal, nr)

    # Raise each of the first nr entries of vec to the power `scal`, in place.
    @staticmethod
    cdef inline void pow_i(weight_t* vec, const weight_t scal, int32_t nr) nogil:
        cdef int idx
        for idx in range(nr):
            vec[idx] **= scal

    # output[k] = vec[k] / scal for every k in 0..nr-1 (C division semantics).
    @staticmethod
    @cython.cdivision(True)
    cdef inline void div(weight_t* output,
                         const weight_t* vec,
                         weight_t scal,
                         int32_t nr) nogil:
        memcpy(output, vec, nr * sizeof(weight_t))
        Vec.div_i(output, scal, nr)

    # Divide each of the first nr entries of vec by `scal`, in place. With
    # cdivision enabled no zero-division check is generated.
    @staticmethod
    @cython.cdivision(True)
    cdef inline void div_i(weight_t* vec, const weight_t scal, int32_t nr) nogil:
        cdef int idx
        for idx in range(nr):
            vec[idx] /= scal

    # output[k] = exp(vec[k]) for every k in 0..nr-1.
    @staticmethod
    cdef inline void exp(weight_t* output, const weight_t* vec, int32_t nr) nogil:
        memcpy(output, vec, nr * sizeof(weight_t))
        Vec.exp_i(output, nr)

    # Replace each of the first nr entries of vec by its exponential, using
    # the module-level exp() declared from math.h.
    @staticmethod
    cdef inline void exp_i(weight_t* vec, int32_t nr) nogil:
        cdef int idx
        for idx in range(nr):
            vec[idx] = exp(vec[idx])

    # Replace each of the first nr entries of vec by 1.0 / entry, in place.
    @staticmethod
    cdef inline void reciprocal_i(weight_t* vec, int32_t nr) nogil:
        cdef int idx
        for idx in range(nr):
            vec[idx] = 1.0 / vec[idx]
cdef class VecVec:
    # Static, GIL-free helpers that combine two raw weight_t buffers of the
    # same explicitly passed length. `_i` variants overwrite their first
    # argument; the others copy `x` into `output` and delegate.

    # output[k] = x[k] + y[k] * scale for every k in 0..nr-1.
    @staticmethod
    cdef inline void add(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         weight_t scale,
                         int32_t nr) nogil:
        memcpy(output, x, nr * sizeof(weight_t))
        VecVec.add_i(output, y, scale, nr)

    # x[k] += y[k] * scale for every k in 0..nr-1. Uses blis.axpyv when
    # compiled with USE_BLAS, otherwise a plain loop.
    @staticmethod
    cdef inline void add_i(weight_t* x,
                           const weight_t* y,
                           weight_t scale,
                           int32_t nr) nogil:
        cdef int idx
        IF USE_BLAS:
            blis.axpyv(blis.NO_CONJUGATE, nr, scale, <weight_t*>y, 1, x, 1)
        ELSE:
            for idx in range(nr):
                x[idx] += y[idx] * scale

    # For fixed x, matrix of y: `y` is read as nr_batch consecutive chunks of
    # nr values, and each chunk (times scale) is accumulated into the same x.
    @staticmethod
    cdef inline void batch_add_i(weight_t* x,
                                 const weight_t* y,
                                 weight_t scale,
                                 int32_t nr,
                                 int32_t nr_batch) nogil:
        cdef int _
        for _ in range(nr_batch):
            VecVec.add_i(x, y, scale, nr)
            # Step the local pointer to the next chunk of y.
            y += nr

    # output[k] = x[k] + y[k] ** power for every k in 0..nr-1.
    @staticmethod
    cdef inline void add_pow(weight_t* output,
                             const weight_t* x,
                             const weight_t* y,
                             weight_t power,
                             int32_t nr) nogil:
        memcpy(output, x, nr * sizeof(weight_t))
        VecVec.add_pow_i(output, y, power, nr)

    # x[k] += y[k] ** power for every k in 0..nr-1.
    @staticmethod
    cdef inline void add_pow_i(weight_t* x,
                               const weight_t* y,
                               weight_t power,
                               int32_t nr) nogil:
        cdef int idx
        for idx in range(nr):
            x[idx] += y[idx] ** power

    # output[k] = x[k] * y[k] for every k in 0..nr-1.
    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         int32_t nr) nogil:
        memcpy(output, x, nr * sizeof(weight_t))
        VecVec.mul_i(output, y, nr)

    # x[k] *= y[k] for every k in 0..nr-1.
    @staticmethod
    cdef inline void mul_i(weight_t* x,
                           const weight_t* y,
                           int32_t nr) nogil:
        cdef int idx
        for idx in range(nr):
            x[idx] *= y[idx]

    # Sum over k in 0..nr-1 of x[k] * y[k]; 0 for an empty range.
    @staticmethod
    cdef inline weight_t dot(const weight_t* x,
                             const weight_t* y,
                             int32_t nr) nogil:
        cdef weight_t acc = 0
        cdef int idx
        for idx in range(nr):
            acc += x[idx] * y[idx]
        return acc

    # Index of the highest score among the classes whose is_valid flag is
    # non-zero; -1 when no class is flagged. The earliest index wins ties.
    @staticmethod
    cdef inline int arg_max_if_true(const weight_t* scores,
                                    const int* is_valid,
                                    const int n_classes) nogil:
        cdef int idx
        cdef int winner = -1
        for idx in range(n_classes):
            if not is_valid[idx]:
                continue
            if winner == -1 or scores[idx] > scores[winner]:
                winner = idx
        return winner

    # Index of the highest score among the classes whose cost equals 0;
    # -1 when no class has zero cost. The earliest index wins ties.
    @staticmethod
    cdef inline int arg_max_if_zero(const weight_t* scores,
                                    const weight_t* costs,
                                    const int n_classes) nogil:
        cdef int idx
        cdef int winner = -1
        for idx in range(n_classes):
            if costs[idx] != 0:
                continue
            if winner == -1 or scores[idx] > scores[winner]:
                winner = idx
        return winner
cdef class Mat:
    # Static, GIL-free column statistics over a row-major matrix stored in a
    # flat weight_t buffer (element (r, c) is read at mat[r * nr_col + c]).

    # Write the per-column mean of `mat` into Ex[0:nr_col].
    # Ex is zeroed first, every row is accumulated into it, and the sums are
    # scaled by 1 / nr_row. With no rows the zeroed Ex is returned as is.
    @staticmethod
    cdef inline void mean_row(weight_t* Ex,
            const weight_t* mat, int32_t nr_row, int32_t nr_col) nogil:
        cdef int i
        memset(Ex, 0, sizeof(Ex[0]) * nr_col)
        if nr_row == 0:
            # Nothing to average. Scaling by 1.0 / 0 would turn the zeros
            # into NaN (cdivision is on), so stop here, mirroring var_row.
            return
        for i in range(nr_row):
            VecVec.add_i(Ex, &mat[i * nr_col], 1.0, nr_col)
        Vec.mul_i(Ex, 1.0 / nr_row, nr_col)

    # Write the per-column variance of `mat` (plus `eps`) into Vx[0:nr_col].
    # `Ex` supplies the value subtracted from each column before squaring
    # (the column means, as produced by mean_row). Vx is left untouched when
    # the matrix has no rows or no columns.
    @staticmethod
    cdef inline void var_row(weight_t* Vx,
            const weight_t* mat, const weight_t* Ex,
            int32_t nr_row, int32_t nr_col, weight_t eps) nogil:
        # From https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
        if nr_row == 0 or nr_col == 0:
            return
        cdef int i, j
        cdef weight_t sum_, sum2, delta
        for i in range(nr_col):
            sum_ = 0.0
            sum2 = 0.0
            for j in range(nr_row):
                delta = mat[j * nr_col + i] - Ex[i]
                sum2 += delta ** 2
                sum_ += delta
            # Shifted-data formula: the sum_ term corrects for Ex[i] not
            # being the exact mean of the column.
            Vx[i] = (sum2 - sum_**2 / nr_row) / nr_row
            Vx[i] += eps
cdef class MatVec:
    # Static, GIL-free helpers combining a row-major matrix (flat weight_t
    # buffer, element (r, c) at mat[r * nr_col + c]) with a vector.
    # NOTE: every *dot* routine here ACCUMULATES into `output` (the fallback
    # loops use `+=` and the blis calls pass 1.0 as the output scaling
    # factor), so callers that want a plain product must zero `output` first.

    # Add scale * vec[0:nr_col] to every row of mat, in place.
    @staticmethod
    cdef inline void add_i(weight_t* mat,
            const weight_t* vec, weight_t scale, int32_t nr_row, int32_t nr_col) nogil:
        cdef int i
        for i in range(nr_row):
            VecVec.add_i(mat + (i * nr_col),
                vec, scale, nr_col)

    # output = mat with every row multiplied element-wise by vec[0:nr_col].
    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* mat,
                         const weight_t* vec,
                         int32_t nr_row, int32_t nr_col) nogil:
        memcpy(output, mat, sizeof(output[0]) * nr_row * nr_col)
        MatVec.mul_i(output, vec, nr_row, nr_col)

    # Multiply every row of mat element-wise by vec[0:nr_col], in place.
    @staticmethod
    cdef inline void mul_i(weight_t* mat,
                           const weight_t* vec,
                           int32_t nr_row, int32_t nr_col) nogil:
        cdef int i, row, col
        for i in range(nr_row):
            row = i * nr_col
            for col in range(nr_col):
                mat[row + col] *= vec[col]

    # output[r] += sum over c of mat[r, c] * vec[c], for r in 0..nr_row-1.
    # output has nr_row entries, vec has nr_col entries.
    @staticmethod
    cdef inline void dot(weight_t* output,
                         const weight_t* mat,
                         const weight_t* vec,
                         int32_t nr_row, int32_t nr_col) nogil:
        # Only the non-BLAS fallback uses these indices.
        cdef int i, row, col
        IF USE_BLAS:
            blis.gemv(
                blis.NO_TRANSPOSE,
                blis.NO_CONJUGATE,
                nr_row,
                nr_col,
                1.0,
                <weight_t*>mat, nr_col, 1,
                <weight_t*>vec, 1,
                1.0,
                output, 1
            )
        ELSE:
            for i in range(nr_row):
                row = i * nr_col
                for col in range(nr_col):
                    output[i] += mat[row + col] * vec[col]

    # Batched form of dot(): for each of nr_batch input vectors, accumulate
    # mat . vec into the matching nr_row-long slice of output.
    @staticmethod
    cdef inline void batch_dot(weight_t* output,
                               const weight_t* mat,
                               const weight_t* vec,
                               int32_t nr_row, int32_t nr_col, int32_t nr_batch) nogil:
        # Output dim: batch_size * nr_row
        # vec dim:    batch_size * nr_col
        # mat dim:    nr_row     * nr_col
        # batch_size must be M, because can't transpose C
        # so nr_row must be N
        # so nr_col must be K
        # vec:   M * K
        # mat.T: K * N
        # out:   M * N
        cdef int _
        IF USE_BLAS:
            blis.gemm(
                blis.NO_TRANSPOSE,
                blis.TRANSPOSE,
                nr_batch,
                nr_row,
                nr_col,
                1.0,
                <weight_t*>vec,
                nr_col,
                1,
                <weight_t*>mat,
                nr_col,
                1,
                1.0,
                output,
                nr_row,
                1)
        ELSE:
            for _ in range(nr_batch):
                MatVec.dot(output,
                    mat, vec, nr_row, nr_col)
                # Advance the local pointers to the next batch item.
                output += nr_row
                vec += nr_col

    # Transposed product: output[c] += sum over r of vec[r] * mat[r, c],
    # for c in 0..nr_col-1. output has nr_col entries, vec has nr_row entries.
    @staticmethod
    cdef inline void T_dot(weight_t* output,
                           const weight_t* mat,
                           const weight_t* vec,
                           int32_t nr_row,
                           int32_t nr_col) nogil:
        # Only the non-BLAS fallback uses these indices.
        cdef int row, col
        IF USE_BLAS:
            blis.gemv(
                blis.TRANSPOSE,
                blis.NO_CONJUGATE,
                nr_row, nr_col,
                1.0,
                <weight_t*>mat, nr_col, 1,
                <weight_t*>vec, 1,
                1.0,
                output, 1,
            )
        ELSE:
            for row in range(nr_row):
                for col in range(nr_col):
                    output[col] += vec[row] * mat[(row * nr_col) + col]

    # Batched form of T_dot(): for each of nr_batch input vectors (nr_row
    # long), accumulate mat.T . vec into the matching nr_col-long slice of
    # output.
    @staticmethod
    cdef inline void batch_T_dot(weight_t* output,
                                 const weight_t* mat,
                                 const weight_t* vec,
                                 int32_t nr_row,
                                 int32_t nr_col,
                                 int32_t nr_batch) nogil:
        cdef int _
        IF USE_BLAS:
            # output is (nr_batch, nr_col)
            # mat is (nr_row, nr_col)
            # vec is (nr_batch, nr_row)
            # Output defined as (M, N)
            # So
            # nr_batch = M
            # nr_col = N
            # nr_row = K
            #
            # vec:   M * K
            # mat:   K * N
            # out:   M * N
            blis.gemm(
                blis.NO_TRANSPOSE,
                blis.NO_TRANSPOSE,
                nr_batch,
                nr_col,
                nr_row,
                1.0,
                <weight_t*>vec,
                nr_row,
                1,
                <weight_t*>mat,
                nr_col,
                1,
                1.0,
                output,
                nr_col,
                1)
        ELSE:
            for _ in range(nr_batch):
                MatVec.T_dot(output,
                    mat, vec, nr_row, nr_col)
                # Advance the local pointers to the next batch item.
                output += nr_col
                vec += nr_row
cdef class MatMat:
    # Static, GIL-free helpers over row-major matrices stored in flat
    # weight_t buffers (element (r, c) at buf[r * nr_col + c]). `_i` variants
    # overwrite their first argument; the others copy `x` into `output` and
    # delegate.

    # output = x + y, element-wise, for nr_row * nr_col entries.
    @staticmethod
    cdef inline void add(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         int32_t nr_row, int32_t nr_col) nogil:
        memcpy(output, x, sizeof(output[0]) * nr_row * nr_col)
        MatMat.add_i(output, y, nr_row, nr_col)

    # x += y, element-wise, for nr_row * nr_col entries.
    @staticmethod
    cdef inline void add_i(weight_t* x,
                           const weight_t* y,
                           int32_t nr_row, int32_t nr_col) nogil:
        cdef int i, row, col
        for i in range(nr_row):
            row = i * nr_col
            for col in range(nr_col):
                x[row + col] += y[row + col]

    # output = x * y, element-wise, for nr_row * nr_col entries.
    @staticmethod
    cdef inline void mul(weight_t* output,
                         const weight_t* x,
                         const weight_t* y,
                         int32_t nr_row, int32_t nr_col) nogil:
        memcpy(output, x, sizeof(output[0]) * nr_row * nr_col)
        MatMat.mul_i(output, y, nr_row, nr_col)

    # x *= y, element-wise, for nr_row * nr_col entries.
    @staticmethod
    cdef inline void mul_i(weight_t* x,
                           const weight_t* y,
                           int32_t nr_row, int32_t nr_col) nogil:
        cdef int i, row, col
        for i in range(nr_row):
            row = i * nr_col
            for col in range(nr_col):
                x[row + col] *= y[row + col]

    # Accumulate the outer product of x (nr_row entries) and y (nr_col
    # entries) into mat: mat[r, c] += x[r] * y[c].
    @staticmethod
    cdef inline void add_outer_i(weight_t* mat,
                                 const weight_t* x,
                                 const weight_t* y,
                                 int32_t nr_row,
                                 int32_t nr_col) nogil:
        # Only the non-BLAS fallback uses these indices.
        cdef int i, j, row
        IF USE_BLAS:
            blis.ger(
                blis.NO_CONJUGATE, blis.NO_CONJUGATE,
                nr_row, nr_col,
                1.0,
                <weight_t*>x, 1,
                <weight_t*>y, 1,
                mat, nr_col, 1
            )
        ELSE:
            for i in range(nr_row):
                row = i * nr_col
                for j in range(nr_col):
                    mat[row + j] += x[i] * y[j]

    # Accumulate nr_batch outer products into the same output matrix:
    # for every batch item b, output[r, c] += x[b, r] * y[b, c].
    @staticmethod
    cdef inline void batch_add_outer_i(weight_t* output,
                                       const weight_t* x,
                                       const weight_t* y,
                                       int32_t nr_row,
                                       int32_t nr_col,
                                       int32_t nr_batch) nogil:
        # Output dim: nr_row * nr_col
        # x dim:      batch_size * nr_row
        # y dim:      batch_size * nr_col
        #
        # Output is M*N (can't transpose)
        # nr_row = M
        # nr_col = N
        # batch_size = K
        # x.T: M * K
        # y:   K * N
        # out: M * N
        # Typed explicitly so the fallback loop never depends on type
        # inference inside this nogil function.
        cdef int _, i, j, row
        IF USE_BLAS:
            blis.gemm(
                blis.TRANSPOSE,
                blis.NO_TRANSPOSE,
                nr_row,
                nr_col,
                nr_batch,
                1.0,
                <weight_t*>x,
                nr_row,
                1,
                <weight_t*>y,
                nr_col,
                1,
                1.0,
                output,
                nr_col,
                1)
        ELSE:
            for _ in range(nr_batch):
                for i in range(nr_row):
                    row = i * nr_col
                    for j in range(nr_col):
                        output[row + j] += x[i] * y[j]
                # Advance the local pointers to the next batch item.
                x += nr_row
                y += nr_col