"""Mean shift clustering algorithm. Mean shift clustering aims to discover *blobs* in a smooth density of samples. It is a centroid based algorithm, which works by updating candidates for centroids to be the mean of the points within a given region. These candidates are then filtered in a post-processing stage to eliminate near-duplicates to form the final set of centroids. Seeding is performed using a binning technique for scalability. """ # Authors: Conrad Lee # Alexandre Gramfort # Gael Varoquaux # Martino Sorbaro import numpy as np import warnings from collections import defaultdict from ..externals import six from ..utils.validation import check_is_fitted from ..utils import check_random_state, gen_batches, check_array from ..base import BaseEstimator, ClusterMixin from ..neighbors import NearestNeighbors from ..metrics.pairwise import pairwise_distances_argmin from ..externals.joblib import Parallel from ..externals.joblib import delayed def estimate_bandwidth(X, quantile=0.3, n_samples=None, random_state=0, n_jobs=1): """Estimate the bandwidth to use with the mean-shift algorithm. That this function takes time at least quadratic in n_samples. For large datasets, it's wise to set that parameter to a small value. Parameters ---------- X : array-like, shape=[n_samples, n_features] Input points. quantile : float, default 0.3 should be between [0, 1] 0.5 means that the median of all pairwise distances is used. n_samples : int, optional The number of samples to use. If not given, all samples are used. random_state : int, RandomState instance or None, optional (default=None) If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. n_jobs : int, optional (default = 1) The number of parallel jobs to run for neighbors search. If ``-1``, then the number of jobs is set to the number of CPU cores. Returns ------- bandwidth : float The bandwidth parameter. """ X = check_array(X) random_state = check_random_state(random_state) if n_samples is not None: idx = random_state.permutation(X.shape[0])[:n_samples] X = X[idx] nbrs = NearestNeighbors(n_neighbors=int(X.shape[0] * quantile), n_jobs=n_jobs) nbrs.fit(X) bandwidth = 0. for batch in gen_batches(len(X), 500): d, _ = nbrs.kneighbors(X[batch, :], return_distance=True) bandwidth += np.max(d, axis=1).sum() return bandwidth / X.shape[0] # separate function for each seed's iterative loop def _mean_shift_single_seed(my_mean, X, nbrs, max_iter): # For each seed, climb gradient until convergence or max_iter bandwidth = nbrs.get_params()['radius'] stop_thresh = 1e-3 * bandwidth # when mean has converged completed_iterations = 0 while True: # Find mean of points within bandwidth i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth, return_distance=False)[0] points_within = X[i_nbrs] if len(points_within) == 0: break # Depending on seeding strategy this condition may occur my_old_mean = my_mean # save the old mean my_mean = np.mean(points_within, axis=0) # If converged or at max_iter, adds the cluster if (np.linalg.norm(my_mean - my_old_mean) < stop_thresh or completed_iterations == max_iter): return tuple(my_mean), len(points_within) completed_iterations += 1 def mean_shift(X, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, max_iter=300, n_jobs=1): """Perform mean shift clustering of data using a flat kernel. Read more in the :ref:`User Guide `. Parameters ---------- X : array-like, shape=[n_samples, n_features] Input data. bandwidth : float, optional Kernel bandwidth. If bandwidth is not given, it is determined using a heuristic based on the median of all pairwise distances. This will take quadratic time in the number of samples. The sklearn.cluster.estimate_bandwidth function can be used to do this more efficiently. seeds : array-like, shape=[n_seeds, n_features] or None Point used as initial kernel locations. If None and bin_seeding=False, each data point is used as a seed. If None and bin_seeding=True, see bin_seeding. bin_seeding : boolean, default=False If true, initial kernel locations are not locations of all points, but rather the location of the discretized version of points, where points are binned onto a grid whose coarseness corresponds to the bandwidth. Setting this option to True will speed up the algorithm because fewer seeds will be initialized. Ignored if seeds argument is not None. min_bin_freq : int, default=1 To speed up the algorithm, accept only those bins with at least min_bin_freq points as seeds. cluster_all : boolean, default True If true, then all points are clustered, even those orphans that are not within any kernel. Orphans are assigned to the nearest kernel. If false, then orphans are given cluster label -1. max_iter : int, default 300 Maximum number of iterations, per seed point before the clustering operation terminates (for that seed point), if has not converged yet. n_jobs : int The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel. If -1 all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. .. versionadded:: 0.17 Parallel Execution using *n_jobs*. Returns ------- cluster_centers : array, shape=[n_clusters, n_features] Coordinates of cluster centers. labels : array, shape=[n_samples] Cluster labels for each point. Notes ----- For an example, see :ref:`examples/cluster/plot_mean_shift.py `. """ if bandwidth is None: bandwidth = estimate_bandwidth(X, n_jobs=n_jobs) elif bandwidth <= 0: raise ValueError("bandwidth needs to be greater than zero or None,\ got %f" % bandwidth) if seeds is None: if bin_seeding: seeds = get_bin_seeds(X, bandwidth, min_bin_freq) else: seeds = X n_samples, n_features = X.shape center_intensity_dict = {} nbrs = NearestNeighbors(radius=bandwidth, n_jobs=n_jobs).fit(X) # execute iterations on all seeds in parallel all_res = Parallel(n_jobs=n_jobs)( delayed(_mean_shift_single_seed) (seed, X, nbrs, max_iter) for seed in seeds) # copy results in a dictionary for i in range(len(seeds)): if all_res[i] is not None: center_intensity_dict[all_res[i][0]] = all_res[i][1] if not center_intensity_dict: # nothing near seeds raise ValueError("No point was within bandwidth=%f of any seed." " Try a different seeding strategy \ or increase the bandwidth." % bandwidth) # POST PROCESSING: remove near duplicate points # If the distance between two kernels is less than the bandwidth, # then we have to remove one because it is a duplicate. Remove the # one with fewer points. sorted_by_intensity = sorted(center_intensity_dict.items(), key=lambda tup: tup[1], reverse=True) sorted_centers = np.array([tup[0] for tup in sorted_by_intensity]) unique = np.ones(len(sorted_centers), dtype=np.bool) nbrs = NearestNeighbors(radius=bandwidth, n_jobs=n_jobs).fit(sorted_centers) for i, center in enumerate(sorted_centers): if unique[i]: neighbor_idxs = nbrs.radius_neighbors([center], return_distance=False)[0] unique[neighbor_idxs] = 0 unique[i] = 1 # leave the current point as unique cluster_centers = sorted_centers[unique] # ASSIGN LABELS: a point belongs to the cluster that it is closest to nbrs = NearestNeighbors(n_neighbors=1, n_jobs=n_jobs).fit(cluster_centers) labels = np.zeros(n_samples, dtype=np.int) distances, idxs = nbrs.kneighbors(X) if cluster_all: labels = idxs.flatten() else: labels.fill(-1) bool_selector = distances.flatten() <= bandwidth labels[bool_selector] = idxs.flatten()[bool_selector] return cluster_centers, labels def get_bin_seeds(X, bin_size, min_bin_freq=1): """Finds seeds for mean_shift. Finds seeds by first binning data onto a grid whose lines are spaced bin_size apart, and then choosing those bins with at least min_bin_freq points. Parameters ---------- X : array-like, shape=[n_samples, n_features] Input points, the same points that will be used in mean_shift. bin_size : float Controls the coarseness of the binning. Smaller values lead to more seeding (which is computationally more expensive). If you're not sure how to set this, set it to the value of the bandwidth used in clustering.mean_shift. min_bin_freq : integer, optional Only bins with at least min_bin_freq will be selected as seeds. Raising this value decreases the number of seeds found, which makes mean_shift computationally cheaper. Returns ------- bin_seeds : array-like, shape=[n_samples, n_features] Points used as initial kernel positions in clustering.mean_shift. """ # Bin points bin_sizes = defaultdict(int) for point in X: binned_point = np.round(point / bin_size) bin_sizes[tuple(binned_point)] += 1 # Select only those bins as seeds which have enough members bin_seeds = np.array([point for point, freq in six.iteritems(bin_sizes) if freq >= min_bin_freq], dtype=np.float32) if len(bin_seeds) == len(X): warnings.warn("Binning data failed with provided bin_size=%f," " using data points as seeds." % bin_size) return X bin_seeds = bin_seeds * bin_size return bin_seeds class MeanShift(BaseEstimator, ClusterMixin): """Mean shift clustering using a flat kernel. Mean shift clustering aims to discover "blobs" in a smooth density of samples. It is a centroid-based algorithm, which works by updating candidates for centroids to be the mean of the points within a given region. These candidates are then filtered in a post-processing stage to eliminate near-duplicates to form the final set of centroids. Seeding is performed using a binning technique for scalability. Read more in the :ref:`User Guide `. Parameters ---------- bandwidth : float, optional Bandwidth used in the RBF kernel. If not given, the bandwidth is estimated using sklearn.cluster.estimate_bandwidth; see the documentation for that function for hints on scalability (see also the Notes, below). seeds : array, shape=[n_samples, n_features], optional Seeds used to initialize kernels. If not set, the seeds are calculated by clustering.get_bin_seeds with bandwidth as the grid size and default values for other parameters. bin_seeding : boolean, optional If true, initial kernel locations are not locations of all points, but rather the location of the discretized version of points, where points are binned onto a grid whose coarseness corresponds to the bandwidth. Setting this option to True will speed up the algorithm because fewer seeds will be initialized. default value: False Ignored if seeds argument is not None. min_bin_freq : int, optional To speed up the algorithm, accept only those bins with at least min_bin_freq points as seeds. If not defined, set to 1. cluster_all : boolean, default True If true, then all points are clustered, even those orphans that are not within any kernel. Orphans are assigned to the nearest kernel. If false, then orphans are given cluster label -1. n_jobs : int The number of jobs to use for the computation. This works by computing each of the n_init runs in parallel. If -1 all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Attributes ---------- cluster_centers_ : array, [n_clusters, n_features] Coordinates of cluster centers. labels_ : Labels of each point. Notes ----- Scalability: Because this implementation uses a flat kernel and a Ball Tree to look up members of each kernel, the complexity will tend towards O(T*n*log(n)) in lower dimensions, with n the number of samples and T the number of points. In higher dimensions the complexity will tend towards O(T*n^2). Scalability can be boosted by using fewer seeds, for example by using a higher value of min_bin_freq in the get_bin_seeds function. Note that the estimate_bandwidth function is much less scalable than the mean shift algorithm and will be the bottleneck if it is used. References ---------- Dorin Comaniciu and Peter Meer, "Mean Shift: A robust approach toward feature space analysis". IEEE Transactions on Pattern Analysis and Machine Intelligence. 2002. pp. 603-619. """ def __init__(self, bandwidth=None, seeds=None, bin_seeding=False, min_bin_freq=1, cluster_all=True, n_jobs=1): self.bandwidth = bandwidth self.seeds = seeds self.bin_seeding = bin_seeding self.cluster_all = cluster_all self.min_bin_freq = min_bin_freq self.n_jobs = n_jobs def fit(self, X, y=None): """Perform clustering. Parameters ----------- X : array-like, shape=[n_samples, n_features] Samples to cluster. y : Ignored """ X = check_array(X) self.cluster_centers_, self.labels_ = \ mean_shift(X, bandwidth=self.bandwidth, seeds=self.seeds, min_bin_freq=self.min_bin_freq, bin_seeding=self.bin_seeding, cluster_all=self.cluster_all, n_jobs=self.n_jobs) return self def predict(self, X): """Predict the closest cluster each sample in X belongs to. Parameters ---------- X : {array-like, sparse matrix}, shape=[n_samples, n_features] New data to predict. Returns ------- labels : array, shape [n_samples,] Index of the cluster each sample belongs to. """ check_is_fitted(self, "cluster_centers_") return pairwise_distances_argmin(X, self.cluster_centers_)