laywerrobot/lib/python3.6/site-packages/sklearn/metrics/cluster/tests/test_unsupervised.py
2020-08-27 21:55:39 +02:00

148 lines
5.7 KiB
Python

import numpy as np
import scipy.sparse as sp
from scipy.sparse import csr_matrix
from sklearn import datasets
from sklearn.utils.testing import assert_false
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_raises_regexp
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import assert_greater
from sklearn.metrics.cluster import silhouette_score
from sklearn.metrics.cluster import silhouette_samples
from sklearn.metrics import pairwise_distances
from sklearn.metrics.cluster import calinski_harabaz_score
def test_silhouette():
# Tests the Silhouette Coefficient.
dataset = datasets.load_iris()
X_dense = dataset.data
X_csr = csr_matrix(X_dense)
X_dok = sp.dok_matrix(X_dense)
X_lil = sp.lil_matrix(X_dense)
y = dataset.target
for X in [X_dense, X_csr, X_dok, X_lil]:
D = pairwise_distances(X, metric='euclidean')
# Given that the actual labels are used, we can assume that S would be
# positive.
score_precomputed = silhouette_score(D, y, metric='precomputed')
assert_greater(score_precomputed, 0)
# Test without calculating D
score_euclidean = silhouette_score(X, y, metric='euclidean')
assert_almost_equal(score_precomputed, score_euclidean)
if X is X_dense:
score_dense_without_sampling = score_precomputed
else:
assert_almost_equal(score_euclidean,
score_dense_without_sampling)
# Test with sampling
score_precomputed = silhouette_score(D, y, metric='precomputed',
sample_size=int(X.shape[0] / 2),
random_state=0)
score_euclidean = silhouette_score(X, y, metric='euclidean',
sample_size=int(X.shape[0] / 2),
random_state=0)
assert_greater(score_precomputed, 0)
assert_greater(score_euclidean, 0)
assert_almost_equal(score_euclidean, score_precomputed)
if X is X_dense:
score_dense_with_sampling = score_precomputed
else:
assert_almost_equal(score_euclidean, score_dense_with_sampling)
def test_cluster_size_1():
# Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
# (cluster 0). We also test the case where there are identical samples
# as the only members of a cluster (cluster 2). To our knowledge, this case
# is not discussed in reference material, and we choose for it a sample
# score of 1.
X = [[0.], [1.], [1.], [2.], [3.], [3.]]
labels = np.array([0, 1, 1, 1, 2, 2])
# Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
# Cluster 1: intra-cluster = [.5, .5, 1]
# inter-cluster = [1, 1, 1]
# silhouette = [.5, .5, 0]
# Cluster 2: intra-cluster = [0, 0]
# inter-cluster = [arbitrary, arbitrary]
# silhouette = [1., 1.]
silhouette = silhouette_score(X, labels)
assert_false(np.isnan(silhouette))
ss = silhouette_samples(X, labels)
assert_array_equal(ss, [0, .5, .5, 0, 1, 1])
def test_correct_labelsize():
# Assert 1 < n_labels < n_samples
dataset = datasets.load_iris()
X = dataset.data
# n_labels = n_samples
y = np.arange(X.shape[0])
assert_raises_regexp(ValueError,
'Number of labels is %d\. Valid values are 2 '
'to n_samples - 1 \(inclusive\)' % len(np.unique(y)),
silhouette_score, X, y)
# n_labels = 1
y = np.zeros(X.shape[0])
assert_raises_regexp(ValueError,
'Number of labels is %d\. Valid values are 2 '
'to n_samples - 1 \(inclusive\)' % len(np.unique(y)),
silhouette_score, X, y)
def test_non_encoded_labels():
dataset = datasets.load_iris()
X = dataset.data
labels = dataset.target
assert_equal(
silhouette_score(X, labels * 2 + 10), silhouette_score(X, labels))
assert_array_equal(
silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels))
def test_non_numpy_labels():
dataset = datasets.load_iris()
X = dataset.data
y = dataset.target
assert_equal(
silhouette_score(list(X), list(y)), silhouette_score(X, y))
def test_calinski_harabaz_score():
rng = np.random.RandomState(seed=0)
# Assert message when there is only one label
assert_raise_message(ValueError, "Number of labels is",
calinski_harabaz_score,
rng.rand(10, 2), np.zeros(10))
# Assert message when all point are in different clusters
assert_raise_message(ValueError, "Number of labels is",
calinski_harabaz_score,
rng.rand(10, 2), np.arange(10))
# Assert the value is 1. when all samples are equals
assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),
[0] * 5 + [1] * 5))
# Assert the value is 0. when all the mean cluster are equal
assert_equal(0., calinski_harabaz_score([[-1, -1], [1, 1]] * 10,
[0] * 10 + [1] * 10))
# General case (with non numpy arrays)
X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
[[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
assert_almost_equal(calinski_harabaz_score(X, labels),
45 * (40 - 4) / (5 * (4 - 1)))