laywerrobot/lib/python3.6/site-packages/sklearn/manifold/isomap.py

222 lines
7.4 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
"""Isomap for manifold learning"""
# Author: Jake Vanderplas -- <vanderplas@astro.washington.edu>
# License: BSD 3 clause (C) 2011
import numpy as np
from ..base import BaseEstimator, TransformerMixin
from ..neighbors import NearestNeighbors, kneighbors_graph
from ..utils import check_array
from ..utils.graph import graph_shortest_path
from ..decomposition import KernelPCA
from ..preprocessing import KernelCenterer
class Isomap(BaseEstimator, TransformerMixin):
"""Isomap Embedding
Non-linear dimensionality reduction through Isometric Mapping
Read more in the :ref:`User Guide <isomap>`.
Parameters
----------
n_neighbors : integer
number of neighbors to consider for each point.
n_components : integer
number of coordinates for the manifold
eigen_solver : ['auto'|'arpack'|'dense']
'auto' : Attempt to choose the most efficient solver
for the given problem.
'arpack' : Use Arnoldi decomposition to find the eigenvalues
and eigenvectors.
'dense' : Use a direct solver (i.e. LAPACK)
for the eigenvalue decomposition.
tol : float
Convergence tolerance passed to arpack or lobpcg.
not used if eigen_solver == 'dense'.
max_iter : integer
Maximum number of iterations for the arpack solver.
not used if eigen_solver == 'dense'.
path_method : string ['auto'|'FW'|'D']
Method to use in finding shortest path.
'auto' : attempt to choose the best algorithm automatically.
'FW' : Floyd-Warshall algorithm.
'D' : Dijkstra's algorithm.
neighbors_algorithm : string ['auto'|'brute'|'kd_tree'|'ball_tree']
Algorithm to use for nearest neighbors search,
passed to neighbors.NearestNeighbors instance.
n_jobs : int, optional (default = 1)
The number of parallel jobs to run.
If ``-1``, then the number of jobs is set to the number of CPU cores.
Attributes
----------
embedding_ : array-like, shape (n_samples, n_components)
Stores the embedding vectors.
kernel_pca_ : object
`KernelPCA` object used to implement the embedding.
training_data_ : array-like, shape (n_samples, n_features)
Stores the training data.
nbrs_ : sklearn.neighbors.NearestNeighbors instance
Stores nearest neighbors instance, including BallTree or KDtree
if applicable.
dist_matrix_ : array-like, shape (n_samples, n_samples)
Stores the geodesic distance matrix of training data.
References
----------
.. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric
framework for nonlinear dimensionality reduction. Science 290 (5500)
"""
def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto',
tol=0, max_iter=None, path_method='auto',
neighbors_algorithm='auto', n_jobs=1):
self.n_neighbors = n_neighbors
self.n_components = n_components
self.eigen_solver = eigen_solver
self.tol = tol
self.max_iter = max_iter
self.path_method = path_method
self.neighbors_algorithm = neighbors_algorithm
self.n_jobs = n_jobs
def _fit_transform(self, X):
X = check_array(X)
self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors,
algorithm=self.neighbors_algorithm,
n_jobs=self.n_jobs)
self.nbrs_.fit(X)
self.training_data_ = self.nbrs_._fit_X
self.kernel_pca_ = KernelPCA(n_components=self.n_components,
kernel="precomputed",
eigen_solver=self.eigen_solver,
tol=self.tol, max_iter=self.max_iter,
n_jobs=self.n_jobs)
kng = kneighbors_graph(self.nbrs_, self.n_neighbors,
mode='distance', n_jobs=self.n_jobs)
self.dist_matrix_ = graph_shortest_path(kng,
method=self.path_method,
directed=False)
G = self.dist_matrix_ ** 2
G *= -0.5
self.embedding_ = self.kernel_pca_.fit_transform(G)
def reconstruction_error(self):
"""Compute the reconstruction error for the embedding.
Returns
-------
reconstruction_error : float
Notes
-------
The cost function of an isomap embedding is
``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``
Where D is the matrix of distances for the input data X,
D_fit is the matrix of distances for the output embedding X_fit,
and K is the isomap kernel:
``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``
"""
G = -0.5 * self.dist_matrix_ ** 2
G_center = KernelCenterer().fit_transform(G)
evals = self.kernel_pca_.lambdas_
return np.sqrt(np.sum(G_center ** 2) - np.sum(evals ** 2)) / G.shape[0]
def fit(self, X, y=None):
"""Compute the embedding vectors for data X
Parameters
----------
X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors}
Sample data, shape = (n_samples, n_features), in the form of a
numpy array, precomputed tree, or NearestNeighbors
object.
y: Ignored.
Returns
-------
self : returns an instance of self.
"""
self._fit_transform(X)
return self
def fit_transform(self, X, y=None):
"""Fit the model from data in X and transform X.
Parameters
----------
X : {array-like, sparse matrix, BallTree, KDTree}
Training vector, where n_samples in the number of samples
and n_features is the number of features.
y: Ignored.
Returns
-------
X_new : array-like, shape (n_samples, n_components)
"""
self._fit_transform(X)
return self.embedding_
def transform(self, X):
"""Transform X.
This is implemented by linking the points X into the graph of geodesic
distances of the training data. First the `n_neighbors` nearest
neighbors of X are found in the training data, and from these the
shortest geodesic distances from each point in X to each point in
the training data are computed in order to construct the kernel.
The embedding of X is the projection of this kernel onto the
embedding vectors of the training set.
Parameters
----------
X : array-like, shape (n_samples, n_features)
Returns
-------
X_new : array-like, shape (n_samples, n_components)
"""
X = check_array(X)
distances, indices = self.nbrs_.kneighbors(X, return_distance=True)
# Create the graph of shortest distances from X to self.training_data_
# via the nearest neighbors of X.
# This can be done as a single array operation, but it potentially
# takes a lot of memory. To avoid that, use a loop:
G_X = np.zeros((X.shape[0], self.training_data_.shape[0]))
for i in range(X.shape[0]):
G_X[i] = np.min(self.dist_matrix_[indices[i]] +
distances[i][:, None], 0)
G_X **= 2
G_X *= -0.5
return self.kernel_pca_.transform(G_X)