laywerrobot/lib/python3.6/site-packages/gensim/test/test_matutils.py
2020-08-27 21:55:39 +02:00

247 lines
9.3 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
import logging
import unittest
import numpy as np
from scipy import sparse
from scipy.special import psi # gamma function utils
import gensim.matutils as matutils
# we'll define known, good (slow) version of functions here
# and compare results from these functions vs. cython ones
def logsumexp(x):
"""Log of sum of exponentials.
Parameters
----------
x : numpy.ndarray
Input 2d matrix.
Returns
-------
float
log of sum of exponentials of elements in `x`.
Warnings
--------
By performance reasons, doesn't support NaNs or 1d, 3d, etc arrays like :func:`scipy.special.logsumexp`.
"""
x_max = np.max(x)
x = np.log(np.sum(np.exp(x - x_max)))
x += x_max
return x
def mean_absolute_difference(a, b):
"""Mean absolute difference between two arrays.
Parameters
----------
a : numpy.ndarray
Input 1d array.
b : numpy.ndarray
Input 1d array.
Returns
-------
float
mean(abs(a - b)).
"""
return np.mean(np.abs(a - b))
def dirichlet_expectation(alpha):
"""For a vector :math:`\\theta \sim Dir(\\alpha)`, compute :math:`E[log \\theta]`.
Parameters
----------
alpha : numpy.ndarray
Dirichlet parameter 2d matrix or 1d vector, if 2d - each row is treated as a separate parameter vector.
Returns
-------
numpy.ndarray:
:math:`E[log \\theta]`
"""
if len(alpha.shape) == 1:
result = psi(alpha) - psi(np.sum(alpha))
else:
result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
return result.astype(alpha.dtype, copy=False) # keep the same precision as input
dirichlet_expectation_1d = dirichlet_expectation
dirichlet_expectation_2d = dirichlet_expectation
class TestLdaModelInner(unittest.TestCase):
def setUp(self):
self.random_state = np.random.RandomState()
self.num_runs = 100 # test functions with *num_runs* random inputs
self.num_topics = 100
def testLogSumExp(self):
# test logsumexp
rs = self.random_state
for dtype in [np.float16, np.float32, np.float64]:
for i in range(self.num_runs):
input = rs.uniform(-1000, 1000, size=(self.num_topics, 1))
known_good = logsumexp(input)
test_values = matutils.logsumexp(input)
msg = "logsumexp failed for dtype={}".format(dtype)
self.assertTrue(np.allclose(known_good, test_values), msg)
def testMeanAbsoluteDifference(self):
# test mean_absolute_difference
rs = self.random_state
for dtype in [np.float16, np.float32, np.float64]:
for i in range(self.num_runs):
input1 = rs.uniform(-10000, 10000, size=(self.num_topics,))
input2 = rs.uniform(-10000, 10000, size=(self.num_topics,))
known_good = mean_absolute_difference(input1, input2)
test_values = matutils.mean_absolute_difference(input1, input2)
msg = "mean_absolute_difference failed for dtype={}".format(dtype)
self.assertTrue(np.allclose(known_good, test_values), msg)
def testDirichletExpectation(self):
# test dirichlet_expectation
rs = self.random_state
for dtype in [np.float16, np.float32, np.float64]:
for i in range(self.num_runs):
# 1 dimensional case
input_1d = rs.uniform(.01, 10000, size=(self.num_topics,))
known_good = dirichlet_expectation(input_1d)
test_values = matutils.dirichlet_expectation(input_1d)
msg = "dirichlet_expectation_1d failed for dtype={}".format(dtype)
self.assertTrue(np.allclose(known_good, test_values), msg)
# 2 dimensional case
input_2d = rs.uniform(.01, 10000, size=(1, self.num_topics,))
known_good = dirichlet_expectation(input_2d)
test_values = matutils.dirichlet_expectation(input_2d)
msg = "dirichlet_expectation_2d failed for dtype={}".format(dtype)
self.assertTrue(np.allclose(known_good, test_values), msg)
def manual_unitvec(vec):
# manual unit vector calculation for UnitvecTestCase
vec = vec.astype(np.float)
if sparse.issparse(vec):
vec_sum_of_squares = vec.multiply(vec)
unit = 1. / np.sqrt(vec_sum_of_squares.sum())
return vec.multiply(unit)
elif not sparse.issparse(vec):
sum_vec_squared = np.sum(vec ** 2)
vec /= np.sqrt(sum_vec_squared)
return vec
class UnitvecTestCase(unittest.TestCase):
# test unitvec
def test_sparse_npfloat32(self):
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.float32)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
self.assertEqual(input_vector.dtype, unit_vector.dtype)
def test_sparse_npfloat64(self):
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.float64)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
self.assertEqual(input_vector.dtype, unit_vector.dtype)
def test_sparse_npint32(self):
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.int32)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
self.assertTrue(np.issubdtype(unit_vector.dtype, float))
def test_sparse_npint64(self):
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(np.int64)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
self.assertTrue(np.issubdtype(unit_vector.dtype, float))
def test_dense_npfloat32(self):
input_vector = np.random.uniform(size=(5,)).astype(np.float32)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector, man_unit_vector))
self.assertEqual(input_vector.dtype, unit_vector.dtype)
def test_dense_npfloat64(self):
input_vector = np.random.uniform(size=(5,)).astype(np.float64)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector, man_unit_vector))
self.assertEqual(input_vector.dtype, unit_vector.dtype)
def test_dense_npint32(self):
input_vector = np.random.randint(10, size=5).astype(np.int32)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector, man_unit_vector))
self.assertTrue(np.issubdtype(unit_vector.dtype, float))
def test_dense_npint64(self):
input_vector = np.random.randint(10, size=5).astype(np.int32)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector, man_unit_vector))
self.assertTrue(np.issubdtype(unit_vector.dtype, float))
def test_sparse_python_float(self):
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(float)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
self.assertEqual(input_vector.dtype, unit_vector.dtype)
def test_sparse_python_int(self):
input_vector = sparse.csr_matrix(np.asarray([[1, 0, 0, 0, 3], [0, 0, 4, 3, 0]])).astype(int)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector.data, man_unit_vector.data, atol=1e-3))
self.assertTrue(np.issubdtype(unit_vector.dtype, float))
def test_dense_python_float(self):
input_vector = np.random.uniform(size=(5,)).astype(float)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector, man_unit_vector))
self.assertEqual(input_vector.dtype, unit_vector.dtype)
def test_dense_python_int(self):
input_vector = np.random.randint(10, size=5).astype(int)
unit_vector = matutils.unitvec(input_vector)
man_unit_vector = manual_unitvec(input_vector)
self.assertTrue(np.allclose(unit_vector, man_unit_vector))
self.assertTrue(np.issubdtype(unit_vector.dtype, float))
if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()