laywerrobot/lib/python3.6/site-packages/sklearn/metrics/cluster/tests/test_unsupervised.py

import numpy as np
import scipy.sparse as sp
from scipy.sparse import csr_matrix

from sklearn import datasets
from sklearn.utils.testing import assert_false
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_raises_regexp
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import assert_greater
from sklearn.metrics.cluster import silhouette_score
from sklearn.metrics.cluster import silhouette_samples
from sklearn.metrics import pairwise_distances
from sklearn.metrics.cluster import calinski_harabaz_score


def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X_dense = dataset.data
    X_csr = csr_matrix(X_dense)
    X_dok = sp.dok_matrix(X_dense)
    X_lil = sp.lil_matrix(X_dense)
    y = dataset.target

    for X in [X_dense, X_csr, X_dok, X_lil]:
        D = pairwise_distances(X, metric='euclidean')
        # Given that the actual labels are used, we can assume that S would be
        # positive.
        score_precomputed = silhouette_score(D, y, metric='precomputed')
        assert_greater(score_precomputed, 0)
        # Test without calculating D
        score_euclidean = silhouette_score(X, y, metric='euclidean')
        assert_almost_equal(score_precomputed, score_euclidean)

        if X is X_dense:
            score_dense_without_sampling = score_precomputed
        else:
            assert_almost_equal(score_euclidean,
                                score_dense_without_sampling)

        # Test with sampling
        score_precomputed = silhouette_score(D, y, metric='precomputed',
                                             sample_size=int(X.shape[0] / 2),
                                             random_state=0)
        score_euclidean = silhouette_score(X, y, metric='euclidean',
                                           sample_size=int(X.shape[0] / 2),
                                           random_state=0)
        assert_greater(score_precomputed, 0)
        assert_greater(score_euclidean, 0)
        assert_almost_equal(score_euclidean, score_precomputed)

        if X is X_dense:
            score_dense_with_sampling = score_precomputed
        else:
            assert_almost_equal(score_euclidean, score_dense_with_sampling)


def test_cluster_size_1():
    # Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
    # (cluster 0). We also test the case where there are identical samples
    # as the only members of a cluster (cluster 2). To our knowledge, this case
    # is not discussed in reference material, and we choose for it a sample
    # score of 1.
    X = [[0.], [1.], [1.], [2.], [3.], [3.]]
    labels = np.array([0, 1, 1, 1, 2, 2])

    # Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
    # Cluster 1: intra-cluster = [.5, .5, 1]
    #            inter-cluster = [1, 1, 1]
    #            silhouette    = [.5, .5, 0]
    # Cluster 2: intra-cluster = [0, 0]
    #            inter-cluster = [arbitrary, arbitrary]
    #            silhouette    = [1., 1.]

    silhouette = silhouette_score(X, labels)
    assert_false(np.isnan(silhouette))
    ss = silhouette_samples(X, labels)
    assert_array_equal(ss, [0, .5, .5, 0, 1, 1])


def test_correct_labelsize():
    # Assert 1 < n_labels < n_samples
    dataset = datasets.load_iris()
    X = dataset.data

    # n_labels = n_samples
    y = np.arange(X.shape[0])
    assert_raises_regexp(ValueError,
                         'Number of labels is %d\. Valid values are 2 '
                         'to n_samples - 1 \(inclusive\)' % len(np.unique(y)),
                         silhouette_score, X, y)

    # n_labels = 1
    y = np.zeros(X.shape[0])
    assert_raises_regexp(ValueError,
                         'Number of labels is %d\. Valid values are 2 '
                         'to n_samples - 1 \(inclusive\)' % len(np.unique(y)),
                         silhouette_score, X, y)


def test_non_encoded_labels():
    dataset = datasets.load_iris()
    X = dataset.data
    labels = dataset.target
    assert_equal(
        silhouette_score(X, labels * 2 + 10), silhouette_score(X, labels))
    assert_array_equal(
        silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels))


def test_non_numpy_labels():
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    assert_equal(
        silhouette_score(list(X), list(y)), silhouette_score(X, y))


def test_calinski_harabaz_score():
    rng = np.random.RandomState(seed=0)

    # Assert message when there is only one label
    assert_raise_message(ValueError, "Number of labels is",
                         calinski_harabaz_score,
                         rng.rand(10, 2), np.zeros(10))

    # Assert message when all point are in different clusters
    assert_raise_message(ValueError, "Number of labels is",
                         calinski_harabaz_score,
                         rng.rand(10, 2), np.arange(10))

    # Assert the value is 1. when all samples are equals
    assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),
                                            [0] * 5 + [1] * 5))

    # Assert the value is 0. when all the mean cluster are equal
    assert_equal(0., calinski_harabaz_score([[-1, -1], [1, 1]] * 10,
                                            [0] * 10 + [1] * 10))

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert_almost_equal(calinski_harabaz_score(X, labels),
                        45 * (40 - 4) / (5 * (4 - 1)))
first commit 2020-08-27 21:55:39 +02:00			`import numpy as np`
			`import scipy.sparse as sp`
			`from scipy.sparse import csr_matrix`

			`from sklearn import datasets`
			`from sklearn.utils.testing import assert_false`
			`from sklearn.utils.testing import assert_almost_equal`
			`from sklearn.utils.testing import assert_array_equal`
			`from sklearn.utils.testing import assert_equal`
			`from sklearn.utils.testing import assert_raises_regexp`
			`from sklearn.utils.testing import assert_raise_message`
			`from sklearn.utils.testing import assert_greater`
			`from sklearn.metrics.cluster import silhouette_score`
			`from sklearn.metrics.cluster import silhouette_samples`
			`from sklearn.metrics import pairwise_distances`
			`from sklearn.metrics.cluster import calinski_harabaz_score`


			`def test_silhouette():`
			`# Tests the Silhouette Coefficient.`
			`dataset = datasets.load_iris()`
			`X_dense = dataset.data`
			`X_csr = csr_matrix(X_dense)`
			`X_dok = sp.dok_matrix(X_dense)`
			`X_lil = sp.lil_matrix(X_dense)`
			`y = dataset.target`

			`for X in [X_dense, X_csr, X_dok, X_lil]:`
			`D = pairwise_distances(X, metric='euclidean')`
			`# Given that the actual labels are used, we can assume that S would be`
			`# positive.`
			`score_precomputed = silhouette_score(D, y, metric='precomputed')`
			`assert_greater(score_precomputed, 0)`
			`# Test without calculating D`
			`score_euclidean = silhouette_score(X, y, metric='euclidean')`
			`assert_almost_equal(score_precomputed, score_euclidean)`

			`if X is X_dense:`
			`score_dense_without_sampling = score_precomputed`
			`else:`
			`assert_almost_equal(score_euclidean,`
			`score_dense_without_sampling)`

			`# Test with sampling`
			`score_precomputed = silhouette_score(D, y, metric='precomputed',`
			`sample_size=int(X.shape[0] / 2),`
			`random_state=0)`
			`score_euclidean = silhouette_score(X, y, metric='euclidean',`
			`sample_size=int(X.shape[0] / 2),`
			`random_state=0)`
			`assert_greater(score_precomputed, 0)`
			`assert_greater(score_euclidean, 0)`
			`assert_almost_equal(score_euclidean, score_precomputed)`

			`if X is X_dense:`
			`score_dense_with_sampling = score_precomputed`
			`else:`
			`assert_almost_equal(score_euclidean, score_dense_with_sampling)`


			`def test_cluster_size_1():`
			`# Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster`
			`# (cluster 0). We also test the case where there are identical samples`
			`# as the only members of a cluster (cluster 2). To our knowledge, this case`
			`# is not discussed in reference material, and we choose for it a sample`
			`# score of 1.`
			`X = [[0.], [1.], [1.], [2.], [3.], [3.]]`
			`labels = np.array([0, 1, 1, 1, 2, 2])`

			`# Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention`
			`# Cluster 1: intra-cluster = [.5, .5, 1]`
			`# inter-cluster = [1, 1, 1]`
			`# silhouette = [.5, .5, 0]`
			`# Cluster 2: intra-cluster = [0, 0]`
			`# inter-cluster = [arbitrary, arbitrary]`
			`# silhouette = [1., 1.]`

			`silhouette = silhouette_score(X, labels)`
			`assert_false(np.isnan(silhouette))`
			`ss = silhouette_samples(X, labels)`
			`assert_array_equal(ss, [0, .5, .5, 0, 1, 1])`


			`def test_correct_labelsize():`
			`# Assert 1 < n_labels < n_samples`
			`dataset = datasets.load_iris()`
			`X = dataset.data`

			`# n_labels = n_samples`
			`y = np.arange(X.shape[0])`
			`assert_raises_regexp(ValueError,`
			`'Number of labels is %d\. Valid values are 2 '`
			`'to n_samples - 1 \(inclusive\)' % len(np.unique(y)),`
			`silhouette_score, X, y)`

			`# n_labels = 1`
			`y = np.zeros(X.shape[0])`
			`assert_raises_regexp(ValueError,`
			`'Number of labels is %d\. Valid values are 2 '`
			`'to n_samples - 1 \(inclusive\)' % len(np.unique(y)),`
			`silhouette_score, X, y)`


			`def test_non_encoded_labels():`
			`dataset = datasets.load_iris()`
			`X = dataset.data`
			`labels = dataset.target`
			`assert_equal(`
			`silhouette_score(X, labels * 2 + 10), silhouette_score(X, labels))`
			`assert_array_equal(`
			`silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels))`


			`def test_non_numpy_labels():`
			`dataset = datasets.load_iris()`
			`X = dataset.data`
			`y = dataset.target`
			`assert_equal(`
			`silhouette_score(list(X), list(y)), silhouette_score(X, y))`


			`def test_calinski_harabaz_score():`
			`rng = np.random.RandomState(seed=0)`

			`# Assert message when there is only one label`
			`assert_raise_message(ValueError, "Number of labels is",`
			`calinski_harabaz_score,`
			`rng.rand(10, 2), np.zeros(10))`

			`# Assert message when all point are in different clusters`
			`assert_raise_message(ValueError, "Number of labels is",`
			`calinski_harabaz_score,`
			`rng.rand(10, 2), np.arange(10))`

			`# Assert the value is 1. when all samples are equals`
			`assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),`
			`[0] * 5 + [1] * 5))`

			`# Assert the value is 0. when all the mean cluster are equal`
			`assert_equal(0., calinski_harabaz_score([[-1, -1], [1, 1]] * 10,`
			`[0] * 10 + [1] * 10))`

			`# General case (with non numpy arrays)`
			`X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +`
			`[[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)`
			`labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10`
			`assert_almost_equal(calinski_harabaz_score(X, labels),`
			`45 * (40 - 4) / (5 * (4 - 1)))`