# Author: Vlad Niculae
# License: BSD 3 clause

import sys

import numpy as np

from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import SkipTest
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_false
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import if_safe_multiprocessing_with_blas

from sklearn.decomposition import SparsePCA, MiniBatchSparsePCA
from sklearn.utils import check_random_state
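
# A minimal usage sketch of the estimator exercised below (illustrative only,
# not part of the original test suite; shapes follow the toy sizes used in
# test_correct_shapes):
#
#     spca = SparsePCA(n_components=8, random_state=0)
#     U = spca.fit_transform(X)   # with X of shape (12, 10), U is (12, 8)
#     spca.components_            # sparse dictionary atoms, shape (8, 10)
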
def generate_toy_data(n_components, n_samples, image_size, random_state=None):
    n_features = image_size[0] * image_size[1]

    rng = check_random_state(random_state)
    U = rng.randn(n_samples, n_components)
    V = rng.randn(n_components, n_features)

    centers = [(3, 3), (6, 7), (8, 1)]
    sz = [1, 2, 1]
    for k in range(n_components):
        img = np.zeros(image_size)
        xmin, xmax = centers[k][0] - sz[k], centers[k][0] + sz[k]
        ymin, ymax = centers[k][1] - sz[k], centers[k][1] + sz[k]
        img[xmin:xmax][:, ymin:ymax] = 1.0
        V[k, :] = img.ravel()

    # Y is defined by: Y = UV + noise
    Y = np.dot(U, V)
    Y += 0.1 * rng.randn(Y.shape[0], Y.shape[1])  # Add noise
    return Y, U, V
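
# For reference (an illustrative note, not in the original file): a call such as
#     Y, U, V = generate_toy_data(3, 10, (8, 8), random_state=0)
# returns Y of shape (10, 64), U of shape (10, 3) and V of shape (3, 64);
# each row of V is a flattened 8x8 image containing one small square of ones.
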
# SparsePCA can be a bit slow. To avoid having test times go up, we
# test different aspects of the code in the same test


def test_correct_shapes():
    rng = np.random.RandomState(0)
    X = rng.randn(12, 10)
    spca = SparsePCA(n_components=8, random_state=rng)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (8, 10))
    assert_equal(U.shape, (12, 8))
    # test overcomplete decomposition
    spca = SparsePCA(n_components=13, random_state=rng)
    U = spca.fit_transform(X)
    assert_equal(spca.components_.shape, (13, 10))
    assert_equal(U.shape, (12, 13))


def test_fit_transform():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)

    # Test that CD gives similar results
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
                           alpha=alpha)
    spca_lasso.fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)

    # Test that deprecated ridge_alpha parameter throws warning
    warning_msg = "The ridge_alpha parameter on transform()"
    assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform,
                         Y, ridge_alpha=0.01)
    assert_warns_message(DeprecationWarning, warning_msg, spca_lars.transform,
                         Y, ridge_alpha=None)

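
# Note (illustrative, not in the original file): the warnings above cover the
# deprecated per-call ridge_alpha; the non-deprecated pattern is presumably to
# set ridge_alpha on the estimator itself, e.g.
#     SparsePCA(n_components=3, ridge_alpha=0.01, random_state=0).fit(Y).transform(Y)
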
@if_safe_multiprocessing_with_blas
def test_fit_transform_parallel():
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
                          random_state=0)
    spca_lars.fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
                     random_state=0).fit(Y)
    U2 = spca.transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)


def test_transform_nan():
    # Test that SparsePCA does not return NaN when a feature is zero in
    # every sample.
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    Y[:, 0] = 0
    estimator = SparsePCA(n_components=8)
    assert_false(np.any(np.isnan(estimator.fit_transform(Y))))


def test_fit_transform_tall():
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng)  # tall array
    spca_lars = SparsePCA(n_components=3, method='lars',
                          random_state=rng)
    U1 = spca_lars.fit_transform(Y)
    spca_lasso = SparsePCA(n_components=3, method='cd', random_state=rng)
    U2 = spca_lasso.fit(Y).transform(Y)
    assert_array_almost_equal(U1, U2)


def test_initialization():
    rng = np.random.RandomState(0)
    U_init = rng.randn(5, 3)
    V_init = rng.randn(3, 4)
    model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init, max_iter=0,
                      random_state=rng)
    model.fit(rng.randn(5, 4))
    assert_array_equal(model.components_, V_init)


def test_mini_batch_correct_shapes():
    rng = np.random.RandomState(0)
    X = rng.randn(12, 10)
    pca = MiniBatchSparsePCA(n_components=8, random_state=rng)
    U = pca.fit_transform(X)
    assert_equal(pca.components_.shape, (8, 10))
    assert_equal(U.shape, (12, 8))
    # test overcomplete decomposition
    pca = MiniBatchSparsePCA(n_components=13, random_state=rng)
    U = pca.fit_transform(X)
    assert_equal(pca.components_.shape, (13, 10))
    assert_equal(U.shape, (12, 13))


def test_mini_batch_fit_transform():
    raise SkipTest("skipping mini_batch_fit_transform.")
    alpha = 1
    rng = np.random.RandomState(0)
    Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)  # wide array
    spca_lars = MiniBatchSparsePCA(n_components=3, random_state=0,
                                   alpha=alpha).fit(Y)
    U1 = spca_lars.transform(Y)
    # Test multiple CPUs
    if sys.platform == 'win32':  # fake parallelism for win32
        import sklearn.externals.joblib.parallel as joblib_par
        _mp = joblib_par.multiprocessing
        joblib_par.multiprocessing = None
        try:
            U2 = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                    random_state=0).fit(Y).transform(Y)
        finally:
            joblib_par.multiprocessing = _mp
    else:  # we can efficiently use parallelism
        U2 = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
                                random_state=0).fit(Y).transform(Y)
    assert_true(not np.all(spca_lars.components_ == 0))
    assert_array_almost_equal(U1, U2)
    # Test that CD gives similar results
    spca_lasso = MiniBatchSparsePCA(n_components=3, method='cd', alpha=alpha,
                                    random_state=0).fit(Y)
    assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
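
# Running this module (a sketch, not part of the original file; the path below
# is an assumption and may differ in your checkout):
#     pytest sklearn/decomposition/tests/test_sparse_pca.py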