import numpy as np
import scipy as sp
from itertools import product

from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_no_warnings
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import assert_less

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.decomposition import RandomizedPCA
from sklearn.decomposition.pca import _assess_dimension_
from sklearn.decomposition.pca import _infer_dimension_

iris = datasets.load_iris()
solver_list = ['full', 'arpack', 'randomized', 'auto']

def test_pca():
    # PCA on dense arrays
    X = iris.data

    for n_comp in np.arange(X.shape[1]):
        pca = PCA(n_components=n_comp, svd_solver='full')

        X_r = pca.fit(X).transform(X)
        np.testing.assert_equal(X_r.shape[1], n_comp)

        X_r2 = pca.fit_transform(X)
        assert_array_almost_equal(X_r, X_r2)

        X_r = pca.transform(X)
        X_r2 = pca.fit_transform(X)
        assert_array_almost_equal(X_r, X_r2)

        # Test get_covariance and get_precision
        cov = pca.get_covariance()
        precision = pca.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]), 12)

    # test explained_variance_ratio_ == 1 with all components
    pca = PCA(svd_solver='full')
    pca.fit(X)
    assert_almost_equal(pca.explained_variance_ratio_.sum(), 1.0, 3)

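# Editorial sketch (not part of the original test suite): get_covariance()
# returns the probabilistic PCA model covariance, roughly
# components_.T @ diag(explained variance minus noise) @ components_ plus an
# isotropic noise term, so its product with get_precision() should be the
# identity, which is what the loop above asserts. The helper name below is
# illustrative only:
def _demo_covariance_precision():
    X = iris.data
    pca = PCA(n_components=2, svd_solver='full').fit(X)
    # the model covariance and its inverse should multiply to the identity
    identity = np.dot(pca.get_covariance(), pca.get_precision())
    return np.allclose(identity, np.eye(X.shape[1]))
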
def test_pca_arpack_solver():
    # PCA on dense arrays
    X = iris.data
    d = X.shape[1]

    # Loop excluding the extremes, invalid inputs for arpack
    for n_comp in np.arange(1, d):
        pca = PCA(n_components=n_comp, svd_solver='arpack', random_state=0)

        X_r = pca.fit(X).transform(X)
        np.testing.assert_equal(X_r.shape[1], n_comp)

        X_r2 = pca.fit_transform(X)
        assert_array_almost_equal(X_r, X_r2)

        X_r = pca.transform(X)
        assert_array_almost_equal(X_r, X_r2)

        # Test get_covariance and get_precision
        cov = pca.get_covariance()
        precision = pca.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(d), 12)

    pca = PCA(n_components=0, svd_solver='arpack', random_state=0)
    assert_raises(ValueError, pca.fit, X)
    # Check internal state
    assert_equal(pca.n_components,
                 PCA(n_components=0,
                     svd_solver='arpack', random_state=0).n_components)
    assert_equal(pca.svd_solver,
                 PCA(n_components=0,
                     svd_solver='arpack', random_state=0).svd_solver)

    pca = PCA(n_components=d, svd_solver='arpack', random_state=0)
    assert_raises(ValueError, pca.fit, X)
    assert_equal(pca.n_components,
                 PCA(n_components=d,
                     svd_solver='arpack', random_state=0).n_components)
    assert_equal(pca.svd_solver,
                 PCA(n_components=d,
                     svd_solver='arpack', random_state=0).svd_solver)

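# Editorial sketch: the 'arpack' solver computes a truncated SVD with
# scipy.sparse.linalg.svds, which requires 0 < n_components < min(X.shape);
# both boundary values raise ValueError, as exercised above. The helper name
# is illustrative only:
def _demo_arpack_bounds():
    X = iris.data  # shape (150, 4), so valid n_components are 1, 2, 3
    for bad in (0, X.shape[1]):
        try:
            PCA(n_components=bad, svd_solver='arpack').fit(X)
        except ValueError:
            pass  # expected for both boundary values
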
def test_pca_randomized_solver():
    # PCA on dense arrays
    X = iris.data

    # Loop excluding 0, invalid for the randomized solver
    for n_comp in np.arange(1, X.shape[1]):
        pca = PCA(n_components=n_comp, svd_solver='randomized',
                  random_state=0)

        X_r = pca.fit(X).transform(X)
        np.testing.assert_equal(X_r.shape[1], n_comp)

        X_r2 = pca.fit_transform(X)
        assert_array_almost_equal(X_r, X_r2)

        X_r = pca.transform(X)
        assert_array_almost_equal(X_r, X_r2)

        # Test get_covariance and get_precision
        cov = pca.get_covariance()
        precision = pca.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]), 12)

    pca = PCA(n_components=0, svd_solver='randomized', random_state=0)
    assert_raises(ValueError, pca.fit, X)
    # Check internal state
    assert_equal(pca.n_components,
                 PCA(n_components=0,
                     svd_solver='randomized', random_state=0).n_components)
    assert_equal(pca.svd_solver,
                 PCA(n_components=0,
                     svd_solver='randomized', random_state=0).svd_solver)

def test_no_empty_slice_warning():
    # test if we avoid numpy warnings for computing over empty arrays
    n_components = 10
    n_features = n_components + 2  # anything > n_comps triggered it in 0.16
    X = np.random.uniform(-1, 1, size=(n_components, n_features))
    pca = PCA(n_components=n_components)
    assert_no_warnings(pca.fit, X)

def test_whitening():
    # Check that PCA output has unit-variance
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80
    n_components = 30
    rank = 50

    # some low rank data with correlated features
    X = np.dot(rng.randn(n_samples, rank),
               np.dot(np.diag(np.linspace(10.0, 1.0, rank)),
                      rng.randn(rank, n_features)))
    # the component-wise variance of the first 50 features is 3 times the
    # mean component-wise variance of the remaining 30 features
    X[:, :50] *= 3

    assert_equal(X.shape, (n_samples, n_features))

    # the component-wise variance is thus highly varying:
    assert_greater(X.std(axis=0).std(), 43.8)

    for solver, copy in product(solver_list, (True, False)):
        # whiten the data while projecting to the lower dim subspace
        X_ = X.copy()  # make sure we keep an original across iterations.
        pca = PCA(n_components=n_components, whiten=True, copy=copy,
                  svd_solver=solver, random_state=0, iterated_power=7)
        # test fit_transform
        X_whitened = pca.fit_transform(X_.copy())
        assert_equal(X_whitened.shape, (n_samples, n_components))
        X_whitened2 = pca.transform(X_)
        assert_array_almost_equal(X_whitened, X_whitened2)

        assert_almost_equal(X_whitened.std(ddof=1, axis=0),
                            np.ones(n_components),
                            decimal=6)
        assert_almost_equal(X_whitened.mean(axis=0), np.zeros(n_components))

        X_ = X.copy()
        pca = PCA(n_components=n_components, whiten=False, copy=copy,
                  svd_solver=solver).fit(X_)
        X_unwhitened = pca.transform(X_)
        assert_equal(X_unwhitened.shape, (n_samples, n_components))

        # in that case the output components still have varying variances
        assert_almost_equal(X_unwhitened.std(axis=0).std(), 74.1, 1)
        # we always center, so no test for non-centering.

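# Editorial sketch: whitening rescales each projected component to unit
# variance, and PCA scores are mutually uncorrelated, so the empirical
# covariance of the whitened scores is approximately the identity. A minimal
# standalone illustration (the helper name is illustrative only):
def _demo_whitening_identity_cov():
    rng = np.random.RandomState(0)
    X = rng.randn(100, 10)
    X_whitened = PCA(n_components=3, whiten=True).fit_transform(X)
    # np.cov uses ddof=1 by default, matching the std(ddof=1) check above
    return np.allclose(np.cov(X_whitened, rowvar=False), np.eye(3),
                       atol=1e-6)
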
# Ignore warnings from switching to more power iterations in randomized_svd
@ignore_warnings
def test_explained_variance():
    # Check that the explained variances reported by the different solvers
    # agree with each other and with the empirical variances
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=2, svd_solver='full').fit(X)
    apca = PCA(n_components=2, svd_solver='arpack', random_state=0).fit(X)
    assert_array_almost_equal(pca.explained_variance_,
                              apca.explained_variance_, 1)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              apca.explained_variance_ratio_, 3)

    rpca = PCA(n_components=2, svd_solver='randomized',
               random_state=42).fit(X)
    assert_array_almost_equal(pca.explained_variance_,
                              rpca.explained_variance_, 1)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 1)

    # compare to empirical variances
    expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0]
    expected_result = sorted(expected_result, reverse=True)[:2]

    X_pca = pca.transform(X)
    assert_array_almost_equal(pca.explained_variance_,
                              np.var(X_pca, ddof=1, axis=0))
    assert_array_almost_equal(pca.explained_variance_, expected_result)

    X_pca = apca.transform(X)
    assert_array_almost_equal(apca.explained_variance_,
                              np.var(X_pca, ddof=1, axis=0))
    assert_array_almost_equal(apca.explained_variance_, expected_result)

    X_rpca = rpca.transform(X)
    assert_array_almost_equal(rpca.explained_variance_,
                              np.var(X_rpca, ddof=1, axis=0),
                              decimal=1)
    assert_array_almost_equal(rpca.explained_variance_,
                              expected_result, decimal=1)

    # Same with correlated data
    X = datasets.make_classification(n_samples, n_features,
                                     n_informative=n_features - 2,
                                     random_state=rng)[0]

    pca = PCA(n_components=2).fit(X)
    rpca = PCA(n_components=2, svd_solver='randomized',
               random_state=rng).fit(X)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 5)

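# Editorial sketch: explained_variance_ equals the top eigenvalues of the
# empirical covariance matrix (computed with ddof=1), which is what the
# comparisons above check. The helper name is illustrative only:
def _demo_explained_variance_eigenvalues():
    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    pca = PCA(n_components=2, svd_solver='full').fit(X)
    # eigvalsh returns ascending eigenvalues of the symmetric covariance
    eigvals = np.sort(np.linalg.eigvalsh(np.cov(X, rowvar=False)))[::-1]
    return np.allclose(pca.explained_variance_, eigvals[:2])
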
def test_singular_values():
    # Check that the PCA output has the correct singular values

    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=2, svd_solver='full',
              random_state=rng).fit(X)
    apca = PCA(n_components=2, svd_solver='arpack',
               random_state=rng).fit(X)
    rpca = PCA(n_components=2, svd_solver='randomized',
               random_state=rng).fit(X)
    assert_array_almost_equal(pca.singular_values_, apca.singular_values_,
                              12)
    assert_array_almost_equal(pca.singular_values_, rpca.singular_values_, 1)
    assert_array_almost_equal(apca.singular_values_, rpca.singular_values_,
                              1)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_apca = apca.transform(X)
    X_rpca = rpca.transform(X)
    assert_array_almost_equal(np.sum(pca.singular_values_**2.0),
                              np.linalg.norm(X_pca, "fro")**2.0, 12)
    assert_array_almost_equal(np.sum(apca.singular_values_**2.0),
                              np.linalg.norm(X_apca, "fro")**2.0, 9)
    assert_array_almost_equal(np.sum(rpca.singular_values_**2.0),
                              np.linalg.norm(X_rpca, "fro")**2.0, 0)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(pca.singular_values_,
                              np.sqrt(np.sum(X_pca**2.0, axis=0)), 12)
    assert_array_almost_equal(apca.singular_values_,
                              np.sqrt(np.sum(X_apca**2.0, axis=0)), 12)
    assert_array_almost_equal(rpca.singular_values_,
                              np.sqrt(np.sum(X_rpca**2.0, axis=0)), 2)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=3, svd_solver='full', random_state=rng)
    apca = PCA(n_components=3, svd_solver='arpack', random_state=rng)
    rpca = PCA(n_components=3, svd_solver='randomized', random_state=rng)
    X_pca = pca.fit_transform(X)

    X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    apca.fit(X_hat)
    rpca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(apca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(rpca.singular_values_, [3.142, 2.718, 1.0], 14)

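# Editorial sketch: for unwhitened PCA the singular values are exactly the
# 2-norms of the columns of pca.transform(X) (the score vectors), which the
# assertions above exercise per solver. The helper name is illustrative only:
def _demo_singular_values_column_norms():
    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    pca = PCA(n_components=2, svd_solver='full').fit(X)
    # column norms of the projected (centered) data
    col_norms = np.sqrt(np.sum(pca.transform(X) ** 2, axis=0))
    return np.allclose(pca.singular_values_, col_norms)
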
def test_pca_check_projection():
    # Test that the projection of data is correct
    rng = np.random.RandomState(0)
    n, p = 100, 3
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5])
    Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])

    for solver in solver_list:
        Yt = PCA(n_components=2, svd_solver=solver).fit(X).transform(Xt)
        Yt /= np.sqrt((Yt ** 2).sum())

        assert_almost_equal(np.abs(Yt[0][0]), 1., 1)

def test_pca_inverse():
    # Test that the projection of data can be inverted
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    pca = PCA(n_components=2, svd_solver='full').fit(X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=3)

    # same as above with whitening (approximate reconstruction)
    for solver in solver_list:
        pca = PCA(n_components=2, whiten=True, svd_solver=solver)
        pca.fit(X)
        Y = pca.transform(X)
        Y_inverse = pca.inverse_transform(Y)
        assert_almost_equal(X, Y_inverse, decimal=3)

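# Editorial sketch: inverse_transform maps scores back through components_
# and re-adds the mean, so the round-trip error is on the order of the
# variance in the discarded directions. The helper name is illustrative only:
def _demo_inverse_transform_round_trip():
    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    X[:, 1] *= 1e-5  # make one direction negligible, as in the test above
    pca = PCA(n_components=2, svd_solver='full').fit(X)
    X_back = pca.inverse_transform(pca.transform(X))
    # max error should be roughly the scale of the dropped 1e-5 direction
    return np.abs(X - X_back).max()
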
def test_pca_validation():
    X = [[0, 1], [1, 0]]
    for solver in solver_list:
        for n_components in [-1, 3]:
            assert_raises(ValueError,
                          PCA(n_components, svd_solver=solver).fit, X)

def test_randomized_pca_check_projection():
    # Test that the projection by randomized PCA on dense data is correct
    rng = np.random.RandomState(0)
    n, p = 100, 3
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5])
    Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])

    Yt = PCA(n_components=2, svd_solver='randomized',
             random_state=0).fit(X).transform(Xt)
    Yt /= np.sqrt((Yt ** 2).sum())

    assert_almost_equal(np.abs(Yt[0][0]), 1., 1)

def test_randomized_pca_check_list():
    # Test that the projection by randomized PCA on list data is correct
    X = [[1.0, 0.0], [0.0, 1.0]]
    X_transformed = PCA(n_components=1, svd_solver='randomized',
                        random_state=0).fit(X).transform(X)
    assert_equal(X_transformed.shape, (2, 1))
    assert_almost_equal(X_transformed.mean(), 0.00, 2)
    assert_almost_equal(X_transformed.std(), 0.71, 2)

def test_randomized_pca_inverse():
    # Test that randomized PCA is invertible on dense data
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # check that we can find the original data from the transformed signal
    # (since the data is almost of rank n_components)
    pca = PCA(n_components=2, svd_solver='randomized', random_state=0).fit(X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=2)

    # same as above with whitening (approximate reconstruction)
    pca = PCA(n_components=2, whiten=True, svd_solver='randomized',
              random_state=0).fit(X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    relative_max_delta = (np.abs(X - Y_inverse) / np.abs(X).mean()).max()
    assert_less(relative_max_delta, 1e-5)

def test_pca_dim():
    # Check automated dimensionality setting
    rng = np.random.RandomState(0)
    n, p = 100, 5
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    pca = PCA(n_components='mle', svd_solver='full').fit(X)
    assert_equal(pca.n_components, 'mle')
    assert_equal(pca.n_components_, 1)

def test_infer_dim_1():
    # Check that for data with a single strong direction plus noise, the
    # log-likelihood computed by _assess_dimension_ is maximized, up to a
    # small tolerance, at a dimensionality of 1
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = (rng.randn(n, p) * .1 + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2]) +
         np.array([1, 0, 7, 4, 6]))
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    ll = []
    for k in range(p):
        ll.append(_assess_dimension_(spect, k, n, p))
    ll = np.array(ll)
    assert_greater(ll[1], ll.max() - .01 * n)

def test_infer_dim_2():
    # Check that _infer_dimension_ finds more than one relevant dimension
    # when the data contains two distinct shifted subpopulations
    n, p = 1000, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert_greater(_infer_dimension_(spect, n, p), 1)

def test_infer_dim_3():
    # Check that _infer_dimension_ finds more than two relevant dimensions
    # when the data contains three distinct shifted subpopulations
    n, p = 100, 5
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1
    X[:10] += np.array([3, 4, 5, 1, 2])
    X[10:20] += np.array([6, 0, 7, 2, -1])
    X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
    pca = PCA(n_components=p, svd_solver='full')
    pca.fit(X)
    spect = pca.explained_variance_
    assert_greater(_infer_dimension_(spect, n, p), 2)

def test_infer_dim_by_explained_variance():
    X = iris.data
    pca = PCA(n_components=0.95, svd_solver='full')
    pca.fit(X)
    assert_equal(pca.n_components, 0.95)
    assert_equal(pca.n_components_, 2)

    pca = PCA(n_components=0.01, svd_solver='full')
    pca.fit(X)
    assert_equal(pca.n_components, 0.01)
    assert_equal(pca.n_components_, 1)

    rng = np.random.RandomState(0)
    # more features than samples
    X = rng.rand(5, 20)
    pca = PCA(n_components=.5, svd_solver='full').fit(X)
    assert_equal(pca.n_components, 0.5)
    assert_equal(pca.n_components_, 2)

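# Editorial sketch: a float n_components in (0, 1) is interpreted as the
# fraction of variance to retain, so the fitted n_components_ is the smallest
# k whose cumulative explained_variance_ratio_ reaches that fraction. The
# helper name is illustrative only:
def _demo_n_components_by_variance():
    X = iris.data
    pca = PCA(n_components=0.95, svd_solver='full').fit(X)
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    # smallest k with cumulative ratio >= 0.95; for iris this is 2
    return pca.n_components_ == np.searchsorted(cumulative, 0.95) + 1
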
def test_pca_score():
    # Test that probabilistic PCA scoring yields a reasonable score
    n, p = 1000, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
    for solver in solver_list:
        pca = PCA(n_components=2, svd_solver=solver)
        pca.fit(X)
        ll1 = pca.score(X)
        h = -0.5 * np.log(2 * np.pi * np.exp(1) * 0.1 ** 2) * p
        np.testing.assert_almost_equal(ll1 / h, 1, 0)

def test_pca_score2():
    # Test that probabilistic PCA correctly separates different datasets
    n, p = 100, 3
    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
    for solver in solver_list:
        pca = PCA(n_components=2, svd_solver=solver)
        pca.fit(X)
        ll1 = pca.score(X)
        ll2 = pca.score(rng.randn(n, p) * .2 + np.array([3, 4, 5]))
        assert_greater(ll1, ll2)

        # Test that it gives different scores if whiten=True
        pca = PCA(n_components=2, whiten=True, svd_solver=solver)
        pca.fit(X)
        ll2 = pca.score(X)
        assert_true(ll1 > ll2)

def test_pca_score3():
    # Check that probabilistic PCA selects the right model
    n, p = 200, 3
    rng = np.random.RandomState(0)
    Xl = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
          np.array([1, 0, 7]))
    Xt = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
          np.array([1, 0, 7]))
    ll = np.zeros(p)
    for k in range(p):
        pca = PCA(n_components=k, svd_solver='full')
        pca.fit(Xl)
        ll[k] = pca.score(Xt)

    assert_true(ll.argmax() == 1)

def test_pca_score_with_different_solvers():
    digits = datasets.load_digits()
    X_digits = digits.data

    pca_dict = {svd_solver: PCA(n_components=30, svd_solver=svd_solver,
                                random_state=0)
                for svd_solver in solver_list}

    for pca in pca_dict.values():
        pca.fit(X_digits)
        # Sanity check for the noise_variance_. For more details see
        # https://github.com/scikit-learn/scikit-learn/issues/7568
        # https://github.com/scikit-learn/scikit-learn/issues/8541
        # https://github.com/scikit-learn/scikit-learn/issues/8544
        assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0)

    # Compare scores with different svd_solvers
    score_dict = {svd_solver: pca.score(X_digits)
                  for svd_solver, pca in pca_dict.items()}
    assert_almost_equal(score_dict['full'], score_dict['arpack'])
    assert_almost_equal(score_dict['full'], score_dict['randomized'],
                        decimal=3)

def test_pca_zero_noise_variance_edge_cases():
    # ensure that noise_variance_ is 0 in edge cases
    # when n_components == min(n_samples, n_features)
    n, p = 100, 3

    rng = np.random.RandomState(0)
    X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
    # arpack raises ValueError for n_components == min(n_samples,
    # n_features)
    svd_solvers = ['full', 'randomized']

    for svd_solver in svd_solvers:
        pca = PCA(svd_solver=svd_solver, n_components=p)
        pca.fit(X)
        assert pca.noise_variance_ == 0

        pca.fit(X.T)
        assert pca.noise_variance_ == 0

def test_svd_solver_auto():
    rng = np.random.RandomState(0)
    X = rng.uniform(size=(1000, 50))

    # case: n_components in (0,1) => 'full'
    pca = PCA(n_components=.5)
    pca.fit(X)
    pca_test = PCA(n_components=.5, svd_solver='full')
    pca_test.fit(X)
    assert_array_almost_equal(pca.components_, pca_test.components_)

    # case: max(X.shape) <= 500 => 'full'
    pca = PCA(n_components=5, random_state=0)
    Y = X[:10, :]
    pca.fit(Y)
    pca_test = PCA(n_components=5, svd_solver='full', random_state=0)
    pca_test.fit(Y)
    assert_array_almost_equal(pca.components_, pca_test.components_)

    # case: n_components >= .8 * min(X.shape) => 'full'
    pca = PCA(n_components=50)
    pca.fit(X)
    pca_test = PCA(n_components=50, svd_solver='full')
    pca_test.fit(X)
    assert_array_almost_equal(pca.components_, pca_test.components_)

    # n_components >= 1 and n_components < .8 * min(X.shape) => 'randomized'
    pca = PCA(n_components=10, random_state=0)
    pca.fit(X)
    pca_test = PCA(n_components=10, svd_solver='randomized', random_state=0)
    pca_test.fit(X)
    assert_array_almost_equal(pca.components_, pca_test.components_)

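# Editorial sketch of the 'auto' heuristic as exercised by the cases above:
# 'full' is used for small inputs (max(X.shape) <= 500), for a float
# n_components in (0, 1), and when n_components >= .8 * min(X.shape);
# otherwise 'randomized' is chosen. The helper below merely restates those
# cases for the (1000, 50) input used in the test; its name is illustrative:
def _demo_auto_solver_choice(n_components, shape=(1000, 50)):
    if max(shape) <= 500 or isinstance(n_components, float):
        return 'full'
    if n_components >= .8 * min(shape):
        return 'full'
    return 'randomized'
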
def test_deprecation_randomized_pca():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))

    depr_message = ("Class RandomizedPCA is deprecated; RandomizedPCA was "
                    "deprecated in 0.18 and will be "
                    "removed in 0.20. Use PCA(svd_solver='randomized') "
                    "instead. The new implementation DOES NOT store "
                    "whiten ``components_``. Apply transform to get them.")

    def fit_deprecated(X):
        global Y
        rpca = RandomizedPCA(random_state=0)
        Y = rpca.fit_transform(X)

    assert_warns_message(DeprecationWarning, depr_message, fit_deprecated, X)
    Y_pca = PCA(svd_solver='randomized', random_state=0).fit_transform(X)
    assert_array_almost_equal(Y, Y_pca)

def test_pca_sparse_input():
    X = np.random.RandomState(0).rand(5, 4)
    X = sp.sparse.csr_matrix(X)
    assert sp.sparse.issparse(X)

    for svd_solver in solver_list:
        pca = PCA(n_components=3, svd_solver=svd_solver)

        assert_raises(TypeError, pca.fit, X)

def test_pca_bad_solver():
    X = np.random.RandomState(0).rand(5, 4)
    pca = PCA(n_components=3, svd_solver='bad_argument')
    assert_raises(ValueError, pca.fit, X)

def test_pca_dtype_preservation():
    for svd_solver in solver_list:
        yield check_pca_float_dtype_preservation, svd_solver
        yield check_pca_int_dtype_upcast_to_double, svd_solver

def check_pca_float_dtype_preservation(svd_solver):
    # Ensure that PCA does not upcast the dtype when the input is float32
    X_64 = np.random.RandomState(0).rand(1000, 4).astype(np.float64)
    X_32 = X_64.astype(np.float32)

    pca_64 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_64)
    pca_32 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float32
    assert pca_64.transform(X_64).dtype == np.float64
    assert pca_32.transform(X_32).dtype == np.float32

    assert_array_almost_equal(pca_64.components_, pca_32.components_,
                              decimal=5)

def check_pca_int_dtype_upcast_to_double(svd_solver):
    # Ensure that all int types will be upcast to float64
    X_i64 = np.random.RandomState(0).randint(0, 1000, (1000, 4))
    X_i64 = X_i64.astype(np.int64)
    X_i32 = X_i64.astype(np.int32)

    pca_64 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_i64)
    pca_32 = PCA(n_components=3, svd_solver=svd_solver,
                 random_state=0).fit(X_i32)

    assert pca_64.components_.dtype == np.float64
    assert pca_32.components_.dtype == np.float64
    assert pca_64.transform(X_i64).dtype == np.float64
    assert pca_32.transform(X_i32).dtype == np.float64

    assert_array_almost_equal(pca_64.components_, pca_32.components_,
                              decimal=5)

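
# Editorial sketch: taken together, the two checks above say that float32 and
# float64 inputs keep their dtype while integer inputs are upcast to float64.
# A compact standalone illustration (the helper name is illustrative only):
def _demo_dtype_behaviour():
    X = np.random.RandomState(0).rand(100, 4)
    pca_f32 = PCA(n_components=2,
                  svd_solver='full').fit(X.astype(np.float32))
    pca_int = PCA(n_components=2,
                  svd_solver='full').fit((X * 100).astype(np.int32))
    # float32 is preserved; int32 is upcast to float64
    return (pca_f32.components_.dtype == np.float32 and
            pca_int.components_.dtype == np.float64)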