809 lines
29 KiB
Python
809 lines
29 KiB
Python
""" Principal Component Analysis
|
|
"""
|
|
|
|
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
|
# Olivier Grisel <olivier.grisel@ensta.org>
|
|
# Mathieu Blondel <mathieu@mblondel.org>
|
|
# Denis A. Engemann <denis-alexander.engemann@inria.fr>
|
|
# Michael Eickenberg <michael.eickenberg@inria.fr>
|
|
# Giorgio Patrini <giorgio.patrini@anu.edu.au>
|
|
#
|
|
# License: BSD 3 clause
|
|
|
|
from math import log, sqrt
|
|
|
|
import numpy as np
|
|
from scipy import linalg
|
|
from scipy.special import gammaln
|
|
from scipy.sparse import issparse
|
|
from scipy.sparse.linalg import svds
|
|
|
|
from ..externals import six
|
|
|
|
from .base import _BasePCA
|
|
from ..base import BaseEstimator, TransformerMixin
|
|
from ..utils import deprecated
|
|
from ..utils import check_random_state, as_float_array
|
|
from ..utils import check_array
|
|
from ..utils.extmath import fast_logdet, randomized_svd, svd_flip
|
|
from ..utils.extmath import stable_cumsum
|
|
from ..utils.validation import check_is_fitted
|
|
|
|
|
|
def _assess_dimension_(spectrum, rank, n_samples, n_features):
    """Compute the log-likelihood of a rank ``rank`` dataset.

    The dataset is assumed to be embedded in gaussian noise of shape
    (n_samples, n_features) having spectrum ``spectrum``.

    Parameters
    ----------
    spectrum : array of shape (n)
        Data spectrum (eigenvalues of the covariance matrix, in decreasing
        order). It is not modified.
    rank : int
        Tested rank value.
    n_samples : int
        Number of samples.
    n_features : int
        Number of features.

    Returns
    -------
    ll : float
        The log-likelihood. ``-np.inf`` is returned when the smallest
        eigenvalue retained by ``rank`` is numerically zero: such a rank
        can never be the most likely one, and evaluating the formula would
        take the logarithm of zero.

    Raises
    ------
    ValueError
        If ``rank`` is larger than ``len(spectrum)``.

    Notes
    -----
    This implements the method of `Thomas P. Minka:
    Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604`
    """
    if rank > len(spectrum):
        raise ValueError("The tested rank cannot exceed the rank of the"
                         " dataset")

    # Eigenvalues below this threshold are treated as zero.
    eps = 1e-15

    if rank > 0 and spectrum[rank - 1] < eps:
        # A (numerically) null eigenvalue inside the retained subspace makes
        # log(spectrum[:rank]) and the pairwise terms below degenerate
        # (log(0) / division by zero). The likelihood of such a rank is
        # vanishingly small, so report it as the worst possible score.
        return -np.inf

    pu = -rank * log(2.)
    for i in range(rank):
        pu += (gammaln((n_features - i) / 2.) -
               log(np.pi) * (n_features - i) / 2.)

    pl = np.sum(np.log(spectrum[:rank]))
    pl = -pl * n_samples / 2.

    if rank == n_features:
        pv = 0
        v = 1
    else:
        # Average discarded eigenvalue (noise variance); floored at eps so
        # that neither log(v) nor 1 / v below can blow up.
        v = max(eps, np.sum(spectrum[rank:]) / (n_features - rank))
        pv = -np.log(v) * n_samples * (n_features - rank) / 2.

    m = n_features * rank - rank * (rank + 1.) / 2.
    pp = log(2. * np.pi) * (m + rank + 1.) / 2.

    pa = 0.
    # Work on a copy: the discarded part of the spectrum is replaced by the
    # noise variance without touching the caller's array.
    spectrum_ = spectrum.copy()
    spectrum_[rank:n_features] = v
    for i in range(rank):
        for j in range(i + 1, len(spectrum)):
            pa += log((spectrum[i] - spectrum[j]) *
                      (1. / spectrum_[j] - 1. / spectrum_[i])) + log(n_samples)

    ll = pu + pl + pv + pp - pa / 2. - rank * log(n_samples) / 2.

    return ll
|
|
|
|
|
|
def _infer_dimension_(spectrum, n_samples, n_features):
    """Infers the dimension of a dataset of shape (n_samples, n_features)

    The dataset is described by its spectrum `spectrum`. Every candidate
    rank from 1 to ``len(spectrum) - 1`` is scored with
    ``_assess_dimension_`` and the best-scoring rank is returned.

    Parameters
    ----------
    spectrum : array of shape (n)
        Data spectrum.
    n_samples : int
        Number of samples.
    n_features : int
        Number of features.

    Returns
    -------
    rank : int
        Index of the highest log-likelihood. Rank 0 is never scored, so 0 is
        only returned when no other candidate exists (single-element
        spectrum) or when every candidate scores ``-inf``.
    """
    n_spectrum = len(spectrum)
    # Start from -inf everywhere so that rank 0, which is deliberately not
    # evaluated, can never win the argmax: a model with zero components is
    # not a meaningful answer.
    ll = np.full(n_spectrum, -np.inf)
    for rank in range(1, n_spectrum):
        ll[rank] = _assess_dimension_(spectrum, rank, n_samples, n_features)
    return ll.argmax()
|
|
|
|
|
|
class PCA(_BasePCA):
    """Principal component analysis (PCA)

    Linear dimensionality reduction using Singular Value Decomposition of the
    data to project it to a lower dimensional space.

    It uses the LAPACK implementation of the full SVD or a randomized truncated
    SVD by the method of Halko et al. 2009, depending on the shape of the input
    data and the number of components to extract.

    It can also use the scipy.sparse.linalg ARPACK implementation of the
    truncated SVD.

    Notice that this class does not support sparse input. See
    :class:`TruncatedSVD` for an alternative with sparse data.

    Read more in the :ref:`User Guide <PCA>`.

    Parameters
    ----------
    n_components : int, float, None or string
        Number of components to keep.
        if n_components is not set all components are kept::

            n_components == min(n_samples, n_features)

        (one less than that when svd_solver == 'arpack', which cannot
        extract all the components).

        if n_components == 'mle' and svd_solver == 'full', Minka's MLE is used
        to guess the dimension
        if ``0 < n_components < 1`` and svd_solver == 'full', select the number
        of components such that the amount of variance that needs to be
        explained is greater than the percentage specified by n_components
        n_components cannot be equal to n_features for svd_solver == 'arpack'.

    copy : bool (default True)
        If False, data passed to fit are overwritten and running
        fit(X).transform(X) will not yield the expected results,
        use fit_transform(X) instead.

    whiten : bool, optional (default False)
        When True (False by default) the `components_` vectors are multiplied
        by the square root of n_samples and then divided by the singular values
        to ensure uncorrelated outputs with unit component-wise variances.

        Whitening will remove some information from the transformed signal
        (the relative variance scales of the components) but can sometimes
        improve the predictive accuracy of the downstream estimators by
        making their data respect some hard-wired assumptions.

    svd_solver : string {'auto', 'full', 'arpack', 'randomized'}
        auto :
            the solver is selected by a default policy based on `X.shape` and
            `n_components`: if the input data is larger than 500x500 and the
            number of components to extract is lower than 80% of the smallest
            dimension of the data, then the more efficient 'randomized'
            method is enabled. Otherwise the exact full SVD is computed and
            optionally truncated afterwards.
        full :
            run exact full SVD calling the standard LAPACK solver via
            `scipy.linalg.svd` and select the components by postprocessing
        arpack :
            run SVD truncated to n_components calling ARPACK solver via
            `scipy.sparse.linalg.svds`. It requires strictly
            0 < n_components < X.shape[1]
        randomized :
            run randomized SVD by the method of Halko et al.

        .. versionadded:: 0.18.0

    tol : float >= 0, optional (default .0)
        Tolerance for singular values computed by svd_solver == 'arpack'.

        .. versionadded:: 0.18.0

    iterated_power : int >= 0, or 'auto', (default 'auto')
        Number of iterations for the power method computed by
        svd_solver == 'randomized'.

        .. versionadded:: 0.18.0

    random_state : int, RandomState instance or None, optional (default None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used when ``svd_solver`` == 'arpack' or 'randomized'.

        .. versionadded:: 0.18.0

    Attributes
    ----------
    components_ : array, shape (n_components, n_features)
        Principal axes in feature space, representing the directions of
        maximum variance in the data. The components are sorted by
        ``explained_variance_``.

    explained_variance_ : array, shape (n_components,)
        The amount of variance explained by each of the selected components.

        Equal to n_components largest eigenvalues
        of the covariance matrix of X.

        .. versionadded:: 0.18

    explained_variance_ratio_ : array, shape (n_components,)
        Percentage of variance explained by each of the selected components.

        If ``n_components`` is not set then all components are stored and the
        sum of explained variances is equal to 1.0.

    singular_values_ : array, shape (n_components,)
        The singular values corresponding to each of the selected components.
        The singular values are equal to the 2-norms of the ``n_components``
        variables in the lower-dimensional space.

    mean_ : array, shape (n_features,)
        Per-feature empirical mean, estimated from the training set.

        Equal to `X.mean(axis=0)`.

    n_components_ : int
        The estimated number of components. When n_components is set
        to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this
        number is estimated from input data. Otherwise it equals the parameter
        n_components, or min(n_samples, n_features) if n_components is None
        (one less than that with svd_solver == 'arpack').

    noise_variance_ : float
        The estimated noise covariance following the Probabilistic PCA model
        from Tipping and Bishop 1999. See "Pattern Recognition and
        Machine Learning" by C. Bishop, 12.2.1 p. 574 or
        http://www.miketipping.com/papers/met-mppca.pdf. It is required to
        compute the estimated data covariance and score samples.

        Equal to the average of (min(n_features, n_samples) - n_components)
        smallest eigenvalues of the covariance matrix of X.

    References
    ----------
    For n_components == 'mle', this class uses the method of `Thomas P. Minka:
    Automatic Choice of Dimensionality for PCA. NIPS 2000: 598-604`

    Implements the probabilistic PCA model from:
    M. Tipping and C. Bishop, Probabilistic Principal Component Analysis,
    Journal of the Royal Statistical Society, Series B, 61, Part 3, pp. 611-622
    via the score and score_samples methods.
    See http://www.miketipping.com/papers/met-mppca.pdf

    For svd_solver == 'arpack', refer to `scipy.sparse.linalg.svds`.

    For svd_solver == 'randomized', see:
    `Finding structure with randomness: Stochastic algorithms
    for constructing approximate matrix decompositions Halko, et al., 2009
    (arXiv:0909.4061)`
    `A randomized algorithm for the decomposition of matrices
    Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert`


    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.decomposition import PCA
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> pca = PCA(n_components=2)
    >>> pca.fit(X)
    PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
      svd_solver='auto', tol=0.0, whiten=False)
    >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
    [ 0.99244... 0.00755...]
    >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
    [ 6.30061... 0.54980...]

    >>> pca = PCA(n_components=2, svd_solver='full')
    >>> pca.fit(X)                 # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
      svd_solver='full', tol=0.0, whiten=False)
    >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
    [ 0.99244... 0.00755...]
    >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
    [ 6.30061... 0.54980...]

    >>> pca = PCA(n_components=1, svd_solver='arpack')
    >>> pca.fit(X)
    PCA(copy=True, iterated_power='auto', n_components=1, random_state=None,
      svd_solver='arpack', tol=0.0, whiten=False)
    >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
    [ 0.99244...]
    >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
    [ 6.30061...]

    See also
    --------
    KernelPCA
    SparsePCA
    TruncatedSVD
    IncrementalPCA
    """

    def __init__(self, n_components=None, copy=True, whiten=False,
                 svd_solver='auto', tol=0.0, iterated_power='auto',
                 random_state=None):
        self.n_components = n_components
        self.copy = copy
        self.whiten = whiten
        self.svd_solver = svd_solver
        self.tol = tol
        self.iterated_power = iterated_power
        self.random_state = random_state

    def fit(self, X, y=None):
        """Fit the model with X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : Ignored.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        self._fit(X)
        return self

    def fit_transform(self, X, y=None):
        """Fit the model with X and apply the dimensionality reduction on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : Ignored.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)

        """
        U, S, V = self._fit(X)
        U = U[:, :self.n_components_]

        if self.whiten:
            # X_new = X * V / S * sqrt(n_samples - 1) = U * sqrt(n_samples - 1)
            # n_samples_ is set by _fit; the raw X may be a plain list
            # without a ``shape`` attribute.
            U *= sqrt(self.n_samples_ - 1)
        else:
            # X_new = X * V = U * S * V^T * V = U * S
            U *= S[:self.n_components_]

        return U

    def _fit(self, X):
        """Dispatch to the right submethod depending on the chosen solver.

        Validates X, resolves ``n_components=None`` and
        ``svd_solver='auto'`` and returns the ``(U, S, V)`` triplet produced
        by the selected solver.
        """

        # Raise an error for sparse input.
        # This is more informative than the generic one raised by check_array.
        if issparse(X):
            raise TypeError('PCA does not support sparse input. See '
                            'TruncatedSVD for a possible alternative.')

        X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True,
                        copy=self.copy)

        # Handle n_components==None
        if self.n_components is None:
            # No more than min(n_samples, n_features) components exist, and
            # ARPACK can only extract strictly fewer than that.
            if self.svd_solver != 'arpack':
                n_components = min(X.shape)
            else:
                n_components = min(X.shape) - 1
        else:
            n_components = self.n_components

        # Handle svd_solver
        svd_solver = self.svd_solver
        if svd_solver == 'auto':
            # Small problem or n_components == 'mle', just call full PCA.
            # 'mle' must be tested before the numeric comparisons below,
            # which are not defined for a string.
            if max(X.shape) <= 500 or n_components == 'mle':
                svd_solver = 'full'
            elif n_components >= 1 and n_components < .8 * min(X.shape):
                svd_solver = 'randomized'
            # This is also the case of n_components in (0,1)
            else:
                svd_solver = 'full'

        # Call different fits for either full or truncated SVD
        if svd_solver == 'full':
            return self._fit_full(X, n_components)
        elif svd_solver in ['arpack', 'randomized']:
            return self._fit_truncated(X, n_components, svd_solver)
        else:
            raise ValueError("Unrecognized svd_solver='{0}'"
                             "".format(svd_solver))

    def _fit_full(self, X, n_components):
        """Fit the model by computing full SVD on X

        X is centered in place. ``n_components`` may be an int, a float in
        (0, 1) (fraction of variance to explain) or 'mle'. Returns the
        untruncated ``(U, S, V)`` of the centered data.
        """
        n_samples, n_features = X.shape

        if n_components == 'mle':
            if n_samples < n_features:
                raise ValueError("n_components='mle' is only supported "
                                 "if n_samples >= n_features")
        elif not 0 <= n_components <= n_features:
            raise ValueError("n_components=%r must be between 0 and "
                             "n_features=%r with svd_solver='full'"
                             % (n_components, n_features))

        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        U, S, V = linalg.svd(X, full_matrices=False)
        # flip eigenvectors' sign to enforce deterministic output
        U, V = svd_flip(U, V)

        components_ = V

        # Get variance explained by singular values
        explained_variance_ = (S ** 2) / (n_samples - 1)
        total_var = explained_variance_.sum()
        explained_variance_ratio_ = explained_variance_ / total_var
        singular_values_ = S.copy()  # Store the singular values.

        # Postprocess the number of components required
        if n_components == 'mle':
            n_components = \
                _infer_dimension_(explained_variance_, n_samples, n_features)
        elif 0 < n_components < 1.0:
            # number of components for which the cumulated explained
            # variance percentage is superior to the desired threshold
            ratio_cumsum = stable_cumsum(explained_variance_ratio_)
            n_components = np.searchsorted(ratio_cumsum, n_components) + 1

        # Compute noise covariance using Probabilistic PCA model
        # The sigma2 maximum likelihood (cf. eq. 12.46)
        if n_components < min(n_features, n_samples):
            self.noise_variance_ = explained_variance_[n_components:].mean()
        else:
            self.noise_variance_ = 0.

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = components_[:n_components]
        self.n_components_ = n_components
        self.explained_variance_ = explained_variance_[:n_components]
        self.explained_variance_ratio_ = \
            explained_variance_ratio_[:n_components]
        self.singular_values_ = singular_values_[:n_components]

        return U, S, V

    def _fit_truncated(self, X, n_components, svd_solver):
        """Fit the model by computing truncated SVD (by ARPACK or randomized)
        on X

        X is centered in place. ``n_components`` must be a number between 1
        and n_features (strictly less than n_features for ARPACK). Returns
        the truncated ``(U, S, V)`` of the centered data.
        """
        n_samples, n_features = X.shape

        if isinstance(n_components, six.string_types):
            raise ValueError("n_components=%r cannot be a string "
                             "with svd_solver='%s'"
                             % (n_components, svd_solver))
        elif not 1 <= n_components <= n_features:
            raise ValueError("n_components=%r must be between 1 and "
                             "n_features=%r with svd_solver='%s'"
                             % (n_components, n_features, svd_solver))
        elif svd_solver == 'arpack' and n_components == n_features:
            raise ValueError("n_components=%r must be stricly less than "
                             "n_features=%r with svd_solver='%s'"
                             % (n_components, n_features, svd_solver))

        random_state = check_random_state(self.random_state)

        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        if svd_solver == 'arpack':
            # random init solution, as ARPACK does it internally
            v0 = random_state.uniform(-1, 1, size=min(X.shape))
            U, S, V = svds(X, k=n_components, tol=self.tol, v0=v0)
            # svds doesn't abide by scipy.linalg.svd/randomized_svd
            # conventions, so reverse its outputs.
            S = S[::-1]
            # flip eigenvectors' sign to enforce deterministic output
            U, V = svd_flip(U[:, ::-1], V[::-1])

        elif svd_solver == 'randomized':
            # sign flipping is done inside
            U, S, V = randomized_svd(X, n_components=n_components,
                                     n_iter=self.iterated_power,
                                     flip_sign=True,
                                     random_state=random_state)

        self.n_samples_, self.n_features_ = n_samples, n_features
        self.components_ = V
        self.n_components_ = n_components

        # Get variance explained by singular values
        self.explained_variance_ = (S ** 2) / (n_samples - 1)
        # The truncated SVD does not give the full spectrum, so the total
        # variance is computed from the (centered) data itself.
        total_var = np.var(X, ddof=1, axis=0)
        self.explained_variance_ratio_ = \
            self.explained_variance_ / total_var.sum()
        self.singular_values_ = S.copy()  # Store the singular values.
        if self.n_components_ < min(n_features, n_samples):
            # Average of the variance left unexplained by the kept components
            self.noise_variance_ = (total_var.sum() -
                                    self.explained_variance_.sum())
            self.noise_variance_ /= min(n_features, n_samples) - n_components
        else:
            self.noise_variance_ = 0.

        return U, S, V

    def score_samples(self, X):
        """Return the log-likelihood of each sample.

        See. "Pattern Recognition and Machine Learning"
        by C. Bishop, 12.2.1 p. 574
        or http://www.miketipping.com/papers/met-mppca.pdf

        Parameters
        ----------
        X : array, shape(n_samples, n_features)
            The data.

        Returns
        -------
        ll : array, shape (n_samples,)
            Log-likelihood of each sample under the current model
        """
        check_is_fitted(self, 'mean_')

        X = check_array(X)
        Xr = X - self.mean_
        n_features = X.shape[1]
        precision = self.get_precision()
        # Gaussian log-density: quadratic form plus normalisation constant
        log_like = -.5 * (Xr * (np.dot(Xr, precision))).sum(axis=1)
        log_like -= .5 * (n_features * log(2. * np.pi) -
                          fast_logdet(precision))
        return log_like

    def score(self, X, y=None):
        """Return the average log-likelihood of all samples.

        See. "Pattern Recognition and Machine Learning"
        by C. Bishop, 12.2.1 p. 574
        or http://www.miketipping.com/papers/met-mppca.pdf

        Parameters
        ----------
        X : array, shape(n_samples, n_features)
            The data.

        y : Ignored.

        Returns
        -------
        ll : float
            Average log-likelihood of the samples under the current model
        """
        return np.mean(self.score_samples(X))
|
|
|
|
|
|
@deprecated("RandomizedPCA was deprecated in 0.18 and will be removed in "
            "0.20. "
            "Use PCA(svd_solver='randomized') instead. The new implementation "
            "DOES NOT store whiten ``components_``. Apply transform to get "
            "them.")
class RandomizedPCA(BaseEstimator, TransformerMixin):
    """Principal component analysis (PCA) using randomized SVD

    .. deprecated:: 0.18
        This class will be removed in 0.20.
        Use :class:`PCA` with parameter svd_solver 'randomized' instead.
        The new implementation DOES NOT store whiten ``components_``.
        Apply transform to get them.

    The data is centered, an approximate (randomized) Singular Value
    Decomposition is computed, and only the leading singular vectors are
    kept to project the data to a lower dimensional space.

    Read more in the :ref:`User Guide <RandomizedPCA>`.

    Parameters
    ----------
    n_components : int, optional
        Maximum number of components to keep. When not given or None, this
        is set to n_features (the second dimension of the training data).

    copy : bool
        If False, data passed to fit are overwritten and running
        fit(X).transform(X) will not yield the expected results,
        use fit_transform(X) instead.

    iterated_power : int, default=2
        Number of iterations for the power method.

        .. versionchanged:: 0.18

    whiten : bool, optional
        When True (False by default) the `components_` vectors are multiplied
        by the square root of (n_samples) and divided by the singular values to
        ensure uncorrelated outputs with unit component-wise variances.

        Whitening will remove some information from the transformed signal
        (the relative variance scales of the components) but can sometimes
        improve the predictive accuracy of the downstream estimators by
        making their data respect some hard-wired assumptions.

    random_state : int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Attributes
    ----------
    components_ : array, shape (n_components, n_features)
        Components with maximum variance.

    explained_variance_ratio_ : array, shape (n_components,)
        Percentage of variance explained by each of the selected components.
        If k is not set then all components are stored and the sum of explained
        variances is equal to 1.0.

    singular_values_ : array, shape (n_components,)
        The singular values corresponding to each of the selected components.
        The singular values are equal to the 2-norms of the ``n_components``
        variables in the lower-dimensional space.

    mean_ : array, shape (n_features,)
        Per-feature empirical mean, estimated from the training set.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.decomposition import RandomizedPCA
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> pca = RandomizedPCA(n_components=2)
    >>> pca.fit(X)                 # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    RandomizedPCA(copy=True, iterated_power=2, n_components=2,
           random_state=None, whiten=False)
    >>> print(pca.explained_variance_ratio_)  # doctest: +ELLIPSIS
    [ 0.99244... 0.00755...]
    >>> print(pca.singular_values_)  # doctest: +ELLIPSIS
    [ 6.30061... 0.54980...]

    See also
    --------
    PCA
    TruncatedSVD

    References
    ----------

    .. [Halko2009] `Finding structure with randomness: Stochastic algorithms
      for constructing approximate matrix decompositions Halko, et al., 2009
      (arXiv:909)`

    .. [MRT] `A randomized algorithm for the decomposition of matrices
      Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert`

    """

    def __init__(self, n_components=None, copy=True, iterated_power=2,
                 whiten=False, random_state=None):
        self.n_components = n_components
        self.copy = copy
        self.iterated_power = iterated_power
        self.whiten = whiten
        self.random_state = random_state

    def fit(self, X, y=None):
        """Fit the model with X by extracting the first principal components.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : Ignored.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        validated = check_array(X)
        self._fit(validated)
        return self

    def _fit(self, X):
        """Fit the model to the data X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        X : ndarray, shape (n_samples, n_features)
            The centered input data (a copy of the input unless
            ``copy=False``).
        """
        rng = check_random_state(self.random_state)
        X = np.atleast_2d(as_float_array(X, copy=self.copy))

        n_samples = X.shape[0]

        # Remove the per-feature mean in place and remember it
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_

        # Default to one component per feature
        n_components = (X.shape[1] if self.n_components is None
                        else self.n_components)

        U, S, V = randomized_svd(X, n_components,
                                 n_iter=self.iterated_power,
                                 random_state=rng)

        exp_var = (S ** 2) / (n_samples - 1)
        self.explained_variance_ = exp_var
        full_var = np.var(X, ddof=1, axis=0).sum()
        self.explained_variance_ratio_ = exp_var / full_var
        self.singular_values_ = S  # Store the singular values.

        if not self.whiten:
            self.components_ = V
        else:
            # Rescale each axis so that projected data has unit variance
            self.components_ = V / S[:, np.newaxis] * sqrt(n_samples)

        return X

    def transform(self, X):
        """Apply dimensionality reduction on X.

        X is projected on the first principal components previously
        extracted from a training set.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            New data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)

        """
        check_is_fitted(self, 'mean_')

        X = check_array(X)
        if self.mean_ is not None:
            X = X - self.mean_

        return np.dot(X, self.components_.T)

    def fit_transform(self, X, y=None):
        """Fit the model with X and apply the dimensionality reduction on X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            New data, where n_samples is the number of samples
            and n_features is the number of features.

        y : Ignored.

        Returns
        -------
        X_new : array-like, shape (n_samples, n_components)

        """
        # _fit hands back the centered data, ready to be projected
        centered = self._fit(check_array(X))
        return np.dot(centered, self.components_.T)

    def inverse_transform(self, X):
        """Transform data back to its original space.

        Returns an array X_original whose transform would be X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_components)
            New data, where n_samples is the number of samples
            and n_components is the number of components.

        Returns
        -------
        X_original array-like, shape (n_samples, n_features)

        Notes
        -----
        If whitening is enabled, inverse_transform does not compute the
        exact inverse operation of transform.
        """
        check_is_fitted(self, 'mean_')

        reconstructed = np.dot(X, self.components_)
        if self.mean_ is None:
            return reconstructed
        return reconstructed + self.mean_
|