318 lines
13 KiB
Python
318 lines
13 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
"""
|
||
|
DBSCAN: Density-Based Spatial Clustering of Applications with Noise
|
||
|
"""
|
||
|
|
||
|
# Author: Robert Layton <robertlayton@gmail.com>
|
||
|
# Joel Nothman <joel.nothman@gmail.com>
|
||
|
# Lars Buitinck
|
||
|
#
|
||
|
# License: BSD 3 clause
|
||
|
|
||
|
import numpy as np
|
||
|
from scipy import sparse
|
||
|
|
||
|
from ..base import BaseEstimator, ClusterMixin
|
||
|
from ..utils import check_array, check_consistent_length
|
||
|
from ..neighbors import NearestNeighbors
|
||
|
|
||
|
from ._dbscan_inner import dbscan_inner
|
||
|
|
||
|
|
||
|
def _precomputed_sparse_neighborhoods(X, eps):
    """Build eps-neighborhoods from a sparse precomputed distance matrix.

    Parameters
    ----------
    X : sparse (CSR) matrix of shape (n_samples, n_samples)
        Pairwise distances. Only stored entries can be neighbors. Duplicate
        entries are summed in-place.

    eps : float
        Maximum distance for a stored entry to count as a neighbor.

    Returns
    -------
    neighborhoods : array of shape (n_samples,), dtype=object
        ``neighborhoods[i]`` holds the column indices of the off-diagonal
        stored entries of row ``i`` with distance at most ``eps``, followed
        by ``i`` itself.
    """
    n_samples = X.shape[0]
    X.sum_duplicates()  # XXX: modifies X's internals in-place

    # Row index of every stored entry, so stored diagonal entries can be
    # recognised by comparing against the column indices.
    entry_rows = np.repeat(np.arange(n_samples), np.diff(X.indptr))

    # Every sample is inserted into its own neighborhood exactly once below.
    # A stored diagonal entry (e.g. an explicit zero) must therefore be
    # dropped here; otherwise the sample would be listed twice and its
    # neighbor count (or total weight) would be overestimated.
    X_mask = (X.data <= eps) & (X.indices != entry_rows)
    masked_indices = X.indices.astype(np.intp, copy=False)[X_mask]
    masked_indptr = np.concatenate(([0], np.cumsum(X_mask)))[X.indptr[1:]]

    # insert the diagonal: a point is its own neighbor, but 0 distance
    # means absence from sparse matrix data
    masked_indices = np.insert(masked_indices, masked_indptr,
                               np.arange(n_samples))
    masked_indptr = masked_indptr[:-1] + np.arange(1, n_samples)

    # split into rows
    neighborhoods = np.empty(n_samples, dtype=object)
    neighborhoods[:] = np.split(masked_indices, masked_indptr)
    return neighborhoods


def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
           algorithm='auto', leaf_size=30, p=2, sample_weight=None, n_jobs=1):
    """Perform DBSCAN clustering from vector array or distance matrix.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    eps : float, optional
        The maximum distance between two samples for them to be considered
        as in the same neighborhood.

    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point
        to be considered as a core point. This includes the point itself.

    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square. X may be a sparse matrix, in which case only "nonzero"
        elements may be considered neighbors for DBSCAN. Each sample is
        always counted exactly once as its own neighbor, whether or not the
        sparse matrix stores a diagonal entry for it.

    metric_params : dict, optional
        Additional keyword arguments for the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        The algorithm to be used by the NearestNeighbors module
        to compute pointwise distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. This can affect the speed
        of the construction and query, as well as the memory required
        to store the tree. The optimal value depends
        on the nature of the problem.

    p : float, optional
        The power of the Minkowski metric to be used to calculate distance
        between points.

    sample_weight : array, shape (n_samples,), optional
        Weight of each sample, such that a sample with a weight of at least
        ``min_samples`` is by itself a core sample; a sample with negative
        weight may inhibit its eps-neighbor from being core.
        Note that weights are absolute, and default to 1.

    n_jobs : int, optional (default = 1)
        The number of parallel jobs to run for neighbors search.
        If ``-1``, then the number of jobs is set to the number of CPU cores.

    Returns
    -------
    core_samples : array [n_core_samples]
        Indices of core samples.

    labels : array [n_samples]
        Cluster labels for each point.  Noisy samples are given the label -1.

    Raises
    ------
    ValueError
        If ``eps`` is not strictly positive.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.

    This implementation bulk-computes all neighborhood queries, which increases
    the memory complexity to O(n.d) where d is the average number of neighbors,
    while original DBSCAN had memory complexity O(n).

    Sparse neighborhoods can be precomputed using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>`
    with ``mode='distance'``.

    When ``metric='precomputed'`` and X is sparse, duplicate entries of X are
    summed in-place.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
    """
    # Written as ``not eps > 0.0`` so that NaN is rejected as well.
    if not eps > 0.0:
        raise ValueError("eps must be positive.")

    X = check_array(X, accept_sparse='csr')
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)
        check_consistent_length(X, sample_weight)

    # Calculate neighborhood for all samples. Each sample is kept in its own
    # neighborhood, because ``min_samples`` includes the point itself.
    if metric == 'precomputed' and sparse.issparse(X):
        neighborhoods = _precomputed_sparse_neighborhoods(X, eps)
    else:
        neighbors_model = NearestNeighbors(radius=eps, algorithm=algorithm,
                                           leaf_size=leaf_size,
                                           metric=metric,
                                           metric_params=metric_params, p=p,
                                           n_jobs=n_jobs)
        neighbors_model.fit(X)
        # This has worst case O(n^2) memory complexity
        neighborhoods = neighbors_model.radius_neighbors(X, eps,
                                                         return_distance=False)

    # Size of each neighborhood: a plain count, or the total weight of the
    # neighbors when sample weights are given.
    if sample_weight is None:
        n_neighbors = np.array([len(neighbors)
                                for neighbors in neighborhoods])
    else:
        n_neighbors = np.array([np.sum(sample_weight[neighbors])
                                for neighbors in neighborhoods])

    # Initially, all samples are noise.
    labels = -np.ones(X.shape[0], dtype=np.intp)

    # Mask (as uint8) of the samples whose neighborhood is dense enough.
    core_samples = np.asarray(n_neighbors >= min_samples, dtype=np.uint8)
    # dbscan_inner fills ``labels`` in-place; its result is not used here.
    dbscan_inner(core_samples, neighborhoods, labels)
    return np.where(core_samples)[0], labels
|
||
|
|
||
|
|
||
|
class DBSCAN(BaseEstimator, ClusterMixin):
    """Density-based clustering of a feature array or a distance matrix.

    DBSCAN stands for Density-Based Spatial Clustering of Applications with
    Noise. The algorithm locates core samples lying in high-density regions
    and grows clusters outwards from them. It works well on data whose
    clusters have similar density.

    Read more in the :ref:`User Guide <dbscan>`.

    Parameters
    ----------
    eps : float, optional (default = 0.5)
        Largest distance at which two samples are still regarded as
        belonging to the same neighborhood.

    min_samples : int, optional (default = 5)
        Number of samples (or total weight) required in a neighborhood for
        its point to qualify as a core point. The point itself is counted.

    metric : string, or callable (default = 'euclidean')
        Metric used to compute distances between instances of a feature
        array. A string or callable must be one of the options accepted by
        metrics.pairwise.pairwise_distances for its metric parameter.
        With "precomputed", X is taken to be a square distance matrix.
        X may then be sparse, in which case only "nonzero" elements may be
        considered neighbors for DBSCAN.

        .. versionadded:: 0.17
           metric *precomputed* to accept precomputed sparse matrix.

    metric_params : dict, optional
        Extra keyword arguments handed to the metric function.

        .. versionadded:: 0.19

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
        Algorithm the NearestNeighbors module uses to compute pointwise
        distances and find nearest neighbors.
        See NearestNeighbors module documentation for details.

    leaf_size : int, optional (default = 30)
        Leaf size passed to BallTree or cKDTree. It can influence the speed
        of construction and querying as well as the memory needed to store
        the tree. The best value depends on the nature of the problem.

    p : float, optional
        Power of the Minkowski metric used to calculate the distance
        between points.

    n_jobs : int, optional (default = 1)
        Number of parallel jobs to run.
        If ``-1``, then the number of jobs is set to the number of CPU cores.

    Attributes
    ----------
    core_sample_indices_ : array, shape = [n_core_samples]
        Indices of the core samples.

    components_ : array, shape = [n_core_samples, n_features]
        Copy of every core sample found during training.

    labels_ : array, shape = [n_samples]
        Cluster label of each point of the dataset passed to fit().
        Noisy samples carry the label -1.

    Notes
    -----
    For an example, see :ref:`examples/cluster/plot_dbscan.py
    <sphx_glr_auto_examples_cluster_plot_dbscan.py>`.

    All neighborhood queries are computed in bulk, which raises the memory
    complexity to O(n.d), d being the average number of neighbors, whereas
    the original DBSCAN had memory complexity O(n).

    Sparse neighborhoods can be precomputed using
    :func:`NearestNeighbors.radius_neighbors_graph
    <sklearn.neighbors.NearestNeighbors.radius_neighbors_graph>`
    with ``mode='distance'``.

    References
    ----------
    Ester, M., H. P. Kriegel, J. Sander, and X. Xu, "A Density-Based
    Algorithm for Discovering Clusters in Large Spatial Databases with Noise".
    In: Proceedings of the 2nd International Conference on Knowledge Discovery
    and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996
    """

    def __init__(self, eps=0.5, min_samples=5, metric='euclidean',
                 metric_params=None, algorithm='auto', leaf_size=30, p=None,
                 n_jobs=1):
        # Hyper-parameters are stored untouched; fit() forwards them to
        # dbscan() through get_params().
        self.eps = eps
        self.min_samples = min_samples
        self.metric = metric
        self.metric_params = metric_params
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.p = p
        self.n_jobs = n_jobs

    def fit(self, X, y=None, sample_weight=None):
        """Cluster X with DBSCAN and store the result on the estimator.

        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
                array of shape (n_samples, n_samples)
            Feature array, or array of distances between samples when
            ``metric='precomputed'``.
        sample_weight : array, shape (n_samples,), optional
            Per-sample weight. A sample whose weight is at least
            ``min_samples`` is a core sample on its own, while a sample with
            negative weight may keep its eps-neighbor from being core.
            Weights are absolute and default to 1.

        y : Ignored

        Returns
        -------
        self
            The estimator, with ``core_sample_indices_``, ``labels_`` and
            ``components_`` set.
        """
        X = check_array(X, accept_sparse='csr')
        core_indices, cluster_labels = dbscan(X, sample_weight=sample_weight,
                                              **self.get_params())
        self.core_sample_indices_ = core_indices
        self.labels_ = cluster_labels

        if len(core_indices) == 0:
            # no core samples: keep an empty array with the right width
            self.components_ = np.empty((0, X.shape[1]))
        else:
            # explicit copy: fix for scipy sparse indexing issue
            self.components_ = X[core_indices].copy()
        return self

    def fit_predict(self, X, y=None, sample_weight=None):
        """Cluster X and return the resulting cluster labels.

        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
                array of shape (n_samples, n_samples)
            Feature array, or array of distances between samples when
            ``metric='precomputed'``.
        sample_weight : array, shape (n_samples,), optional
            Per-sample weight. A sample whose weight is at least
            ``min_samples`` is a core sample on its own, while a sample with
            negative weight may keep its eps-neighbor from being core.
            Weights are absolute and default to 1.

        y : Ignored

        Returns
        -------
        y : ndarray, shape (n_samples,)
            Cluster labels, as stored in ``labels_`` by fit().
        """
        self.fit(X, sample_weight=sample_weight)
        return self.labels_
|