laywerrobot/lib/python3.6/site-packages/gensim/_matutils.pyx

360 lines
8.8 KiB
Cython
Raw Normal View History

2020-08-27 21:55:39 +02:00
#!/usr/bin/env cython
# coding: utf-8
# cython: embedsignature=True
from __future__ import division
cimport cython
import numpy as np
cimport numpy as np
ctypedef cython.floating DTYPE_t
from libc.math cimport log, exp, fabs
from cython.parallel import prange
def mean_absolute_difference(a, b):
"""Mean absolute difference between two arrays, using :func:`~gensim._matutils._mean_absolute_difference`.
Parameters
----------
a : numpy.ndarray
Input 1d array, supports float16, float32 and float64.
b : numpy.ndarray
Input 1d array, supports float16, float32 and float64.
Returns
-------
float
mean(abs(a - b)).
"""
if a.shape != b.shape:
raise ValueError("a and b must have same shape")
if a.dtype == np.float64:
return _mean_absolute_difference[double](a, b)
elif a.dtype == np.float32:
return _mean_absolute_difference[float](a, b)
elif a.dtype == np.float16:
return _mean_absolute_difference[float](a.astype(np.float32), b.astype(np.float32))
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cdef DTYPE_t _mean_absolute_difference(DTYPE_t[:] a, DTYPE_t[:] b) nogil:
"""Mean absolute difference between two arrays.
Parameters
----------
a : numpy.ndarray
Input 1d array.
b : numpy.ndarray
Input 1d array.
Returns
-------
DTYPE_t
mean(abs(a - b))
"""
cdef DTYPE_t result = 0.0
cdef size_t i
cdef size_t j
cdef size_t I = a.shape[0]
cdef size_t N = I
for i in range(I):
result += fabs(a[i] - b[i])
result /= N
return result
def logsumexp(x):
"""Log of sum of exponentials, using :func:`~gensim._matutils._logsumexp_2d`.
Parameters
----------
x : numpy.ndarray
Input 2d matrix, supports float16, float32 and float64.
Returns
-------
float
log of sum of exponentials of elements in `x`.
Warnings
--------
By performance reasons, doesn't support NaNs or 1d, 3d, etc arrays like :func:`scipy.special.logsumexp`.
"""
if x.dtype == np.float64:
return _logsumexp_2d[double](x)
elif x.dtype == np.float32:
return _logsumexp_2d[float](x)
elif x.dtype == np.float16:
return _logsumexp_2d[float](x.astype(np.float32))
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cdef DTYPE_t _logsumexp_2d(DTYPE_t[:, :] data) nogil:
"""Log of sum of exponentials.
Parameters
----------
x : numpy.ndarray
Input 2d matrix.
Returns
-------
DTYPE_t
log of sum of exponentials of elements in `data`.
"""
cdef DTYPE_t max_val = data[0, 0]
cdef DTYPE_t result = 0.0
cdef size_t i
cdef size_t j
cdef size_t I = data.shape[0]
cdef size_t J = data.shape[1]
for i in range(I):
for j in range(J):
if data[i, j] > max_val:
max_val = data[i, j]
for i in range(I):
for j in range(J):
result += exp(data[i, j] - max_val)
result = log(result) + max_val
return result
def dirichlet_expectation(alpha):
"""Expected value of log(theta) where theta is drawn from a Dirichlet distribution.
Using :func:`~gensim._matutils.dirichlet_expectation_1d` or :func:`~gensim._matutils.dirichlet_expectation_2d`.
Parameters
----------
alpha : numpy.ndarray
Dirichlet parameter 2d matrix or 1d vector, if 2d - each row is treated as a separate parameter vector,
supports float16, float32 and float64.
Returns
-------
numpy.ndarray
Log of expected values, dimension same as `alpha.ndim`.
"""
if alpha.ndim == 2:
return dirichlet_expectation_2d(alpha)
else:
return dirichlet_expectation_1d(alpha)
def dirichlet_expectation_2d(alpha):
"""Expected value of log(theta) where theta is drawn from a Dirichlet distribution.
Using :func:`~gensim._matutils._dirichlet_expectation_2d`.
Parameters
----------
alpha : numpy.ndarray
Dirichlet parameter 2d matrix, each row is treated as a separate parameter vector,
supports float16, float32 and float64.
Returns
-------
numpy.ndarray
Log of expected values, 2d matrix.
"""
if alpha.dtype == np.float64:
out = np.zeros(alpha.shape, dtype=alpha.dtype)
_dirichlet_expectation_2d[double](alpha, out)
elif alpha.dtype == np.float32:
out = np.zeros(alpha.shape, dtype=alpha.dtype)
_dirichlet_expectation_2d[float](alpha, out)
elif alpha.dtype == np.float16:
out = np.zeros(alpha.shape, dtype=np.float32)
_dirichlet_expectation_2d[float](alpha.astype(np.float32), out)
out = out.astype(np.float16)
return out
def dirichlet_expectation_1d(alpha):
"""Expected value of log(theta) where theta is drawn from a Dirichlet distribution.
Using :func:`~gensim._matutils._dirichlet_expectation_1d`.
Parameters
----------
alpha : numpy.ndarray
Dirichlet parameter 1d vector, supports float16, float32 and float64.
Returns
-------
numpy.ndarray
Log of expected values, 1d vector.
"""
if alpha.dtype == np.float64:
out = np.zeros(alpha.shape, dtype=alpha.dtype)
_dirichlet_expectation_1d[double](alpha, out)
elif alpha.dtype == np.float32:
out = np.zeros(alpha.shape, dtype=alpha.dtype)
_dirichlet_expectation_1d[float](alpha, out)
elif alpha.dtype == np.float16:
out = np.zeros(alpha.shape, dtype=np.float32)
_dirichlet_expectation_1d[float](alpha.astype(np.float32), out)
out = out.astype(np.float16)
return out
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _dirichlet_expectation_1d(DTYPE_t[:] alpha, DTYPE_t[:] out) nogil:
"""Expected value of log(theta) where theta is drawn from a Dirichlet distribution.
Parameters
----------
alpha : numpy.ndarray
Dirichlet parameter 1d vector.
out : numpy.ndarray
Output array, contains log of expected values.
"""
cdef DTYPE_t sum_alpha = 0.0
cdef DTYPE_t psi_sum_alpha = 0.0
cdef size_t i
cdef size_t I = alpha.shape[0]
for i in range(I):
sum_alpha += alpha[i]
psi_sum_alpha = _digamma(sum_alpha)
for i in range(I):
out[i] = _digamma(alpha[i]) - psi_sum_alpha
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _dirichlet_expectation_2d(DTYPE_t[:, :] alpha, DTYPE_t[:, :] out) nogil:
"""Expected value of log(theta) where theta is drawn from a Dirichlet distribution.
Parameters
----------
alpha : numpy.ndarray
Dirichlet parameter matrix, each row is treated as a parameter vector for its own Dirichlet.
out : numpy.ndarray
Log of expected values, 2d matrix.
"""
cdef DTYPE_t sum_alpha = 0.0
cdef DTYPE_t psi_sum_alpha = 0.0
cdef size_t i, j
cdef size_t I = alpha.shape[0]
cdef size_t J = alpha.shape[1]
for i in range(I):
sum_alpha = 0.0
for j in range(J):
sum_alpha += alpha[i, j]
psi_sum_alpha = _digamma(sum_alpha)
for j in range(J):
out[i, j] = _digamma(alpha[i, j]) - psi_sum_alpha
def digamma(DTYPE_t x):
"""Digamma function for positive floats, using :func:`~gensim._matutils._digamma`.
Parameters
----------
x : float
Positive value.
Returns
-------
float
Digamma(x).
"""
return _digamma(x)
@cython.cdivision(True)
cdef inline DTYPE_t _digamma(DTYPE_t x,) nogil:
"""Digamma function for positive floats.
Parameters
----------
x : float
Positive value.
Notes
-----
Adapted from:
* Authors:
* Original FORTRAN77 version by Jose Bernardo.
* C version by John Burkardt.
* Reference: Jose Bernardo, Algorithm AS 103: Psi (Digamma) Function,
Applied Statistics, Volume 25, Number 3, 1976, pages 315-317.
* Licensing: This code is distributed under the GNU LGPL license.
Returns
-------
float
Digamma(x).
"""
cdef DTYPE_t c = 8.5;
cdef DTYPE_t euler_mascheroni = 0.57721566490153286060;
cdef DTYPE_t r;
cdef DTYPE_t value;
cdef DTYPE_t x2;
if ( x <= 0.000001 ):
value = - euler_mascheroni - 1.0 / x + 1.6449340668482264365 * x;
return value;
# Reduce to DIGAMA(X + N).
value = 0.0;
x2 = x;
while ( x2 < c ):
value = value - 1.0 / x2;
x2 = x2 + 1.0;
# Use Stirling's (actually de Moivre's) expansion.
r = 1.0 / x2;
value = value + log ( x2 ) - 0.5 * r;
r = r * r;
value = value \
- r * ( 1.0 / 12.0 \
- r * ( 1.0 / 120.0 \
- r * ( 1.0 / 252.0 \
- r * ( 1.0 / 240.0 \
- r * ( 1.0 / 132.0 ) ) ) ) )
return value;