334 lines
11 KiB
Python
334 lines
11 KiB
Python
|
"""Affinity Propagation clustering algorithm."""
|
||
|
|
||
|
# Author: Alexandre Gramfort alexandre.gramfort@inria.fr
|
||
|
# Gael Varoquaux gael.varoquaux@normalesup.org
|
||
|
|
||
|
# License: BSD 3 clause
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
from ..base import BaseEstimator, ClusterMixin
|
||
|
from ..utils import as_float_array, check_array
|
||
|
from ..utils.validation import check_is_fitted
|
||
|
from ..metrics import euclidean_distances
|
||
|
from ..metrics import pairwise_distances_argmin
|
||
|
|
||
|
|
||
|
def affinity_propagation(S, preference=None, convergence_iter=15, max_iter=200,
|
||
|
damping=0.5, copy=True, verbose=False,
|
||
|
return_n_iter=False):
|
||
|
"""Perform Affinity Propagation Clustering of data
|
||
|
|
||
|
Read more in the :ref:`User Guide <affinity_propagation>`.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
|
||
|
S : array-like, shape (n_samples, n_samples)
|
||
|
Matrix of similarities between points
|
||
|
|
||
|
preference : array-like, shape (n_samples,) or float, optional
|
||
|
Preferences for each point - points with larger values of
|
||
|
preferences are more likely to be chosen as exemplars. The number of
|
||
|
exemplars, i.e. of clusters, is influenced by the input preferences
|
||
|
value. If the preferences are not passed as arguments, they will be
|
||
|
set to the median of the input similarities (resulting in a moderate
|
||
|
number of clusters). For a smaller amount of clusters, this can be set
|
||
|
to the minimum value of the similarities.
|
||
|
|
||
|
convergence_iter : int, optional, default: 15
|
||
|
Number of iterations with no change in the number
|
||
|
of estimated clusters that stops the convergence.
|
||
|
|
||
|
max_iter : int, optional, default: 200
|
||
|
Maximum number of iterations
|
||
|
|
||
|
damping : float, optional, default: 0.5
|
||
|
Damping factor between 0.5 and 1.
|
||
|
|
||
|
copy : boolean, optional, default: True
|
||
|
If copy is False, the affinity matrix is modified inplace by the
|
||
|
algorithm, for memory efficiency
|
||
|
|
||
|
verbose : boolean, optional, default: False
|
||
|
The verbosity level
|
||
|
|
||
|
return_n_iter : bool, default False
|
||
|
Whether or not to return the number of iterations.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
|
||
|
cluster_centers_indices : array, shape (n_clusters,)
|
||
|
index of clusters centers
|
||
|
|
||
|
labels : array, shape (n_samples,)
|
||
|
cluster labels for each point
|
||
|
|
||
|
n_iter : int
|
||
|
number of iterations run. Returned only if `return_n_iter` is
|
||
|
set to True.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
|
||
|
<sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
|
||
|
Between Data Points", Science Feb. 2007
|
||
|
"""
|
||
|
S = as_float_array(S, copy=copy)
|
||
|
n_samples = S.shape[0]
|
||
|
|
||
|
if S.shape[0] != S.shape[1]:
|
||
|
raise ValueError("S must be a square array (shape=%s)" % repr(S.shape))
|
||
|
|
||
|
if preference is None:
|
||
|
preference = np.median(S)
|
||
|
if damping < 0.5 or damping >= 1:
|
||
|
raise ValueError('damping must be >= 0.5 and < 1')
|
||
|
|
||
|
random_state = np.random.RandomState(0)
|
||
|
|
||
|
# Place preference on the diagonal of S
|
||
|
S.flat[::(n_samples + 1)] = preference
|
||
|
|
||
|
A = np.zeros((n_samples, n_samples))
|
||
|
R = np.zeros((n_samples, n_samples)) # Initialize messages
|
||
|
# Intermediate results
|
||
|
tmp = np.zeros((n_samples, n_samples))
|
||
|
|
||
|
# Remove degeneracies
|
||
|
S += ((np.finfo(np.double).eps * S + np.finfo(np.double).tiny * 100) *
|
||
|
random_state.randn(n_samples, n_samples))
|
||
|
|
||
|
# Execute parallel affinity propagation updates
|
||
|
e = np.zeros((n_samples, convergence_iter))
|
||
|
|
||
|
ind = np.arange(n_samples)
|
||
|
|
||
|
for it in range(max_iter):
|
||
|
# tmp = A + S; compute responsibilities
|
||
|
np.add(A, S, tmp)
|
||
|
I = np.argmax(tmp, axis=1)
|
||
|
Y = tmp[ind, I] # np.max(A + S, axis=1)
|
||
|
tmp[ind, I] = -np.inf
|
||
|
Y2 = np.max(tmp, axis=1)
|
||
|
|
||
|
# tmp = Rnew
|
||
|
np.subtract(S, Y[:, None], tmp)
|
||
|
tmp[ind, I] = S[ind, I] - Y2
|
||
|
|
||
|
# Damping
|
||
|
tmp *= 1 - damping
|
||
|
R *= damping
|
||
|
R += tmp
|
||
|
|
||
|
# tmp = Rp; compute availabilities
|
||
|
np.maximum(R, 0, tmp)
|
||
|
tmp.flat[::n_samples + 1] = R.flat[::n_samples + 1]
|
||
|
|
||
|
# tmp = -Anew
|
||
|
tmp -= np.sum(tmp, axis=0)
|
||
|
dA = np.diag(tmp).copy()
|
||
|
tmp.clip(0, np.inf, tmp)
|
||
|
tmp.flat[::n_samples + 1] = dA
|
||
|
|
||
|
# Damping
|
||
|
tmp *= 1 - damping
|
||
|
A *= damping
|
||
|
A -= tmp
|
||
|
|
||
|
# Check for convergence
|
||
|
E = (np.diag(A) + np.diag(R)) > 0
|
||
|
e[:, it % convergence_iter] = E
|
||
|
K = np.sum(E, axis=0)
|
||
|
|
||
|
if it >= convergence_iter:
|
||
|
se = np.sum(e, axis=1)
|
||
|
unconverged = (np.sum((se == convergence_iter) + (se == 0))
|
||
|
!= n_samples)
|
||
|
if (not unconverged and (K > 0)) or (it == max_iter):
|
||
|
if verbose:
|
||
|
print("Converged after %d iterations." % it)
|
||
|
break
|
||
|
else:
|
||
|
if verbose:
|
||
|
print("Did not converge")
|
||
|
|
||
|
I = np.where(np.diag(A + R) > 0)[0]
|
||
|
K = I.size # Identify exemplars
|
||
|
|
||
|
if K > 0:
|
||
|
c = np.argmax(S[:, I], axis=1)
|
||
|
c[I] = np.arange(K) # Identify clusters
|
||
|
# Refine the final set of exemplars and clusters and return results
|
||
|
for k in range(K):
|
||
|
ii = np.where(c == k)[0]
|
||
|
j = np.argmax(np.sum(S[ii[:, np.newaxis], ii], axis=0))
|
||
|
I[k] = ii[j]
|
||
|
|
||
|
c = np.argmax(S[:, I], axis=1)
|
||
|
c[I] = np.arange(K)
|
||
|
labels = I[c]
|
||
|
# Reduce labels to a sorted, gapless, list
|
||
|
cluster_centers_indices = np.unique(labels)
|
||
|
labels = np.searchsorted(cluster_centers_indices, labels)
|
||
|
else:
|
||
|
labels = np.empty((n_samples, 1))
|
||
|
cluster_centers_indices = None
|
||
|
labels.fill(np.nan)
|
||
|
|
||
|
if return_n_iter:
|
||
|
return cluster_centers_indices, labels, it + 1
|
||
|
else:
|
||
|
return cluster_centers_indices, labels
|
||
|
|
||
|
|
||
|
###############################################################################
|
||
|
|
||
|
class AffinityPropagation(BaseEstimator, ClusterMixin):
|
||
|
"""Perform Affinity Propagation Clustering of data.
|
||
|
|
||
|
Read more in the :ref:`User Guide <affinity_propagation>`.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
damping : float, optional, default: 0.5
|
||
|
Damping factor (between 0.5 and 1) is the extent to
|
||
|
which the current value is maintained relative to
|
||
|
incoming values (weighted 1 - damping). This in order
|
||
|
to avoid numerical oscillations when updating these
|
||
|
values (messages).
|
||
|
|
||
|
max_iter : int, optional, default: 200
|
||
|
Maximum number of iterations.
|
||
|
|
||
|
convergence_iter : int, optional, default: 15
|
||
|
Number of iterations with no change in the number
|
||
|
of estimated clusters that stops the convergence.
|
||
|
|
||
|
copy : boolean, optional, default: True
|
||
|
Make a copy of input data.
|
||
|
|
||
|
preference : array-like, shape (n_samples,) or float, optional
|
||
|
Preferences for each point - points with larger values of
|
||
|
preferences are more likely to be chosen as exemplars. The number
|
||
|
of exemplars, ie of clusters, is influenced by the input
|
||
|
preferences value. If the preferences are not passed as arguments,
|
||
|
they will be set to the median of the input similarities.
|
||
|
|
||
|
affinity : string, optional, default=``euclidean``
|
||
|
Which affinity to use. At the moment ``precomputed`` and
|
||
|
``euclidean`` are supported. ``euclidean`` uses the
|
||
|
negative squared euclidean distance between points.
|
||
|
|
||
|
verbose : boolean, optional, default: False
|
||
|
Whether to be verbose.
|
||
|
|
||
|
|
||
|
Attributes
|
||
|
----------
|
||
|
cluster_centers_indices_ : array, shape (n_clusters,)
|
||
|
Indices of cluster centers
|
||
|
|
||
|
cluster_centers_ : array, shape (n_clusters, n_features)
|
||
|
Cluster centers (if affinity != ``precomputed``).
|
||
|
|
||
|
labels_ : array, shape (n_samples,)
|
||
|
Labels of each point
|
||
|
|
||
|
affinity_matrix_ : array, shape (n_samples, n_samples)
|
||
|
Stores the affinity matrix used in ``fit``.
|
||
|
|
||
|
n_iter_ : int
|
||
|
Number of iterations taken to converge.
|
||
|
|
||
|
Notes
|
||
|
-----
|
||
|
For an example, see :ref:`examples/cluster/plot_affinity_propagation.py
|
||
|
<sphx_glr_auto_examples_cluster_plot_affinity_propagation.py>`.
|
||
|
|
||
|
The algorithmic complexity of affinity propagation is quadratic
|
||
|
in the number of points.
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
|
||
|
Brendan J. Frey and Delbert Dueck, "Clustering by Passing Messages
|
||
|
Between Data Points", Science Feb. 2007
|
||
|
"""
|
||
|
|
||
|
def __init__(self, damping=.5, max_iter=200, convergence_iter=15,
|
||
|
copy=True, preference=None, affinity='euclidean',
|
||
|
verbose=False):
|
||
|
|
||
|
self.damping = damping
|
||
|
self.max_iter = max_iter
|
||
|
self.convergence_iter = convergence_iter
|
||
|
self.copy = copy
|
||
|
self.verbose = verbose
|
||
|
self.preference = preference
|
||
|
self.affinity = affinity
|
||
|
|
||
|
@property
|
||
|
def _pairwise(self):
|
||
|
return self.affinity == "precomputed"
|
||
|
|
||
|
def fit(self, X, y=None):
|
||
|
""" Create affinity matrix from negative euclidean distances, then
|
||
|
apply affinity propagation clustering.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
|
||
|
X : array-like, shape (n_samples, n_features) or (n_samples, n_samples)
|
||
|
Data matrix or, if affinity is ``precomputed``, matrix of
|
||
|
similarities / affinities.
|
||
|
|
||
|
y : Ignored
|
||
|
|
||
|
"""
|
||
|
X = check_array(X, accept_sparse='csr')
|
||
|
if self.affinity == "precomputed":
|
||
|
self.affinity_matrix_ = X
|
||
|
elif self.affinity == "euclidean":
|
||
|
self.affinity_matrix_ = -euclidean_distances(X, squared=True)
|
||
|
else:
|
||
|
raise ValueError("Affinity must be 'precomputed' or "
|
||
|
"'euclidean'. Got %s instead"
|
||
|
% str(self.affinity))
|
||
|
|
||
|
self.cluster_centers_indices_, self.labels_, self.n_iter_ = \
|
||
|
affinity_propagation(
|
||
|
self.affinity_matrix_, self.preference, max_iter=self.max_iter,
|
||
|
convergence_iter=self.convergence_iter, damping=self.damping,
|
||
|
copy=self.copy, verbose=self.verbose, return_n_iter=True)
|
||
|
|
||
|
if self.affinity != "precomputed":
|
||
|
self.cluster_centers_ = X[self.cluster_centers_indices_].copy()
|
||
|
|
||
|
return self
|
||
|
|
||
|
def predict(self, X):
|
||
|
"""Predict the closest cluster each sample in X belongs to.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
X : {array-like, sparse matrix}, shape (n_samples, n_features)
|
||
|
New data to predict.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
labels : array, shape (n_samples,)
|
||
|
Index of the cluster each sample belongs to.
|
||
|
"""
|
||
|
check_is_fitted(self, "cluster_centers_indices_")
|
||
|
if not hasattr(self, "cluster_centers_"):
|
||
|
raise ValueError("Predict method is not supported when "
|
||
|
"affinity='precomputed'.")
|
||
|
|
||
|
return pairwise_distances_argmin(X, self.cluster_centers_)
|