"""Testing for K-means"""
|
|
import sys
|
|
|
|
import numpy as np
|
|
from scipy import sparse as sp
|
|
|
|
from sklearn.utils.testing import assert_equal
|
|
from sklearn.utils.testing import assert_array_equal
|
|
from sklearn.utils.testing import assert_array_almost_equal
|
|
from sklearn.utils.testing import SkipTest
|
|
from sklearn.utils.testing import assert_almost_equal
|
|
from sklearn.utils.testing import assert_raises
|
|
from sklearn.utils.testing import assert_raises_regex
|
|
from sklearn.utils.testing import assert_true
|
|
from sklearn.utils.testing import assert_greater
|
|
from sklearn.utils.testing import assert_less
|
|
from sklearn.utils.testing import assert_warns
|
|
from sklearn.utils.testing import if_safe_multiprocessing_with_blas
|
|
from sklearn.utils.testing import assert_raise_message
|
|
|
|
|
|
from sklearn.utils.extmath import row_norms
|
|
from sklearn.metrics.cluster import v_measure_score
|
|
from sklearn.cluster import KMeans, k_means
|
|
from sklearn.cluster import MiniBatchKMeans
|
|
from sklearn.cluster.k_means_ import _labels_inertia
|
|
from sklearn.cluster.k_means_ import _mini_batch_step
|
|
from sklearn.datasets.samples_generator import make_blobs
|
|
from sklearn.externals.six.moves import cStringIO as StringIO
|
|
from sklearn.metrics.cluster import homogeneity_score
|
|
|
|
|
|
# non centered, sparse centers to check the
|
|
centers = np.array([
|
|
[0.0, 5.0, 0.0, 0.0, 0.0],
|
|
[1.0, 1.0, 4.0, 0.0, 0.0],
|
|
[1.0, 0.0, 0.0, 5.0, 1.0],
|
|
])
|
|
n_samples = 100
|
|
n_clusters, n_features = centers.shape
|
|
X, true_labels = make_blobs(n_samples=n_samples, centers=centers,
|
|
cluster_std=1., random_state=42)
|
|
X_csr = sp.csr_matrix(X)
|
|
|
|
|
|
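

# Note on the fixture above: it is deliberately easy -- three well-separated
# centers in five dimensions -- so a correct k-means implementation should
# recover ``true_labels`` exactly, up to a permutation of the label ids.
# ``v_measure_score`` is invariant to such permutations, which is why the
# tests below assert that it equals 1.0 rather than comparing raw label ids.
# An illustrative sketch (kept as a comment so it does not run at import):
#
#     km = KMeans(n_clusters=n_clusters, random_state=42).fit(X)
#     assert v_measure_score(true_labels, km.labels_) == 1.0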


def test_elkan_results():
    rnd = np.random.RandomState(0)
    X_normal = rnd.normal(size=(50, 10))
    X_blobs, _ = make_blobs(random_state=0)
    km_full = KMeans(algorithm='full', n_clusters=5, random_state=0, n_init=1)
    km_elkan = KMeans(algorithm='elkan', n_clusters=5,
                      random_state=0, n_init=1)
    for X in [X_normal, X_blobs]:
        km_full.fit(X)
        km_elkan.fit(X)
        assert_array_almost_equal(km_elkan.cluster_centers_,
                                  km_full.cluster_centers_)
        assert_array_equal(km_elkan.labels_, km_full.labels_)
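

# Note on the check above: "elkan" is an exact acceleration of the classical
# "full" (Lloyd) algorithm -- it uses the triangle inequality to skip
# redundant distance computations without changing the assignments. With
# identical seeding (same random_state, single init) both code paths are
# therefore expected to agree on the labels exactly and on the centers up to
# floating point round-off.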


def test_labels_assignment_and_inertia():
    # pure numpy implementation as an easily auditable reference ("gold")
    # implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    labels_gold = - np.ones(n_samples, dtype=int)
    mindist = np.empty(n_samples)
    mindist.fill(np.inf)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1)
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert_true((mindist >= 0.0).all())
    assert_true((labels_gold != -1).all())

    # perform label assignment using the dense array input
    x_squared_norms = (X ** 2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(
        X, x_squared_norms, noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(
        X_csr, x_squared_norms_from_csr, noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
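

# Illustrative sketch (not used by the test above): the explicit loop over
# centers is equivalent to broadcasting the full (n_samples, n_clusters)
# matrix of squared distances in one step:
#
#     dists = ((X[:, np.newaxis, :] - noisy_centers) ** 2).sum(axis=2)
#     labels_vec = dists.argmin(axis=1)
#     inertia_vec = dists.min(axis=1).sum()
#
# The loop form is kept in the test because it is easier to audit.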


def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    counts = np.zeros(new_centers.shape[0], dtype=np.int32)
    counts_csr = np.zeros(new_centers.shape[0], dtype=np.int32)

    x_squared_norms = (X ** 2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(
        X_mb, x_mb_squared_norms, new_centers, counts,
        buffer, 1, None, random_reassign=False)
    assert_greater(old_inertia, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, x_mb_squared_norms, new_centers)
    assert_greater(new_inertia, 0.0)
    assert_less(new_inertia, old_inertia)

    # check that the incremental difference computation matches the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers) ** 2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr, counts_csr,
        buffer_csr, 1, None, random_reassign=False)
    assert_greater(old_inertia_csr, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr)
    assert_greater(new_inertia_csr, 0.0)
    assert_less(new_inertia_csr, old_inertia_csr)

    # check that the incremental difference computation matches the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers) ** 2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
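

# For reference, a sketch of the center update that ``_mini_batch_step``
# performs, following the mini-batch k-means formulation of Sculley (2010):
# each sample ``x`` assigned to a center ``c`` with per-center count ``n``
# pulls the center towards it with a decaying, streaming-mean learning rate:
#
#     n += 1
#     c += (x - c) / n
#
# This is why the inertia recomputed on the same batch right after the step
# is expected to be lower than before it, as asserted above.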


def _check_fitted_model(km):
    # check that the number of cluster centers and distinct labels match
    # the expectation
    centers = km.cluster_centers_
    assert_equal(centers.shape, (n_clusters, n_features))

    labels = km.labels_
    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(km.inertia_, 0.0)

    # check error on dataset being too small
    assert_raises(ValueError, km.fit, [[0., 1.]])
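

# Typical usage of the helper (illustrative sketch): the estimator-level
# tests below all fit a model on the module fixture and funnel it through
# the same set of checks, e.g.
#
#     _check_fitted_model(KMeans(init="random", n_clusters=n_clusters,
#                                random_state=42).fit(X))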


def test_k_means_plus_plus_init():
    km = KMeans(init="k-means++", n_clusters=n_clusters,
                random_state=42).fit(X)
    _check_fitted_model(km)


def test_k_means_new_centers():
    # Explore the part of the code where a new center is reassigned
    X = np.array([[0, 0, 1, 1],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0]])
    labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([[+0, 1, 0, 0],
                            [.2, 0, .2, .2],
                            [+0, 0, 0, 0]])

    km = KMeans(n_clusters=3, init=bad_centers, n_init=1, max_iter=10,
                random_state=1)
    for this_X in (X, sp.coo_matrix(X)):
        km.fit(this_X)
        this_labels = km.labels_
        # Reorder the labels so that the first instance is in cluster 0,
        # the second in cluster 1, ...
        this_labels = np.unique(this_labels,
                                return_index=True)[1][this_labels]
        np.testing.assert_array_equal(this_labels, labels)


@if_safe_multiprocessing_with_blas
def test_k_means_plus_plus_init_2_jobs():
    if sys.version_info[:2] < (3, 4):
        raise SkipTest(
            "Possible multi-process bug with some BLAS under Python < 3.4")

    km = KMeans(init="k-means++", n_clusters=n_clusters, n_jobs=2,
                random_state=42).fit(X)
    _check_fitted_model(km)


def test_k_means_precompute_distances_flag():
    # check that an error is raised if the precompute_distances flag is
    # passed an unsupported value
    km = KMeans(precompute_distances="wrong")
    assert_raises(ValueError, km.fit, X)


def test_k_means_plus_plus_init_sparse():
    km = KMeans(init="k-means++", n_clusters=n_clusters, random_state=42)
    km.fit(X_csr)
    _check_fitted_model(km)


def test_k_means_random_init():
    km = KMeans(init="random", n_clusters=n_clusters, random_state=42)
    km.fit(X)
    _check_fitted_model(km)


def test_k_means_random_init_sparse():
    km = KMeans(init="random", n_clusters=n_clusters, random_state=42)
    km.fit(X_csr)
    _check_fitted_model(km)


def test_k_means_plus_plus_init_not_precomputed():
    km = KMeans(init="k-means++", n_clusters=n_clusters, random_state=42,
                precompute_distances=False).fit(X)
    _check_fitted_model(km)


def test_k_means_random_init_not_precomputed():
    km = KMeans(init="random", n_clusters=n_clusters, random_state=42,
                precompute_distances=False).fit(X)
    _check_fitted_model(km)


def test_k_means_perfect_init():
    km = KMeans(init=centers.copy(), n_clusters=n_clusters, random_state=42,
                n_init=1)
    km.fit(X)
    _check_fitted_model(km)


def test_k_means_n_init():
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 2))

    # two regression tests on bad n_init argument
    # previous bug: n_init <= 0 threw non-informative TypeError (#3858)
    assert_raises_regex(ValueError, "n_init", KMeans(n_init=0).fit, X)
    assert_raises_regex(ValueError, "n_init", KMeans(n_init=-1).fit, X)


def test_k_means_explicit_init_shape():
    # test for sensible errors when giving explicit init
    # with wrong number of features or clusters
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 3))
    for Class in [KMeans, MiniBatchKMeans]:
        # mismatch of number of features
        km = Class(n_init=1, init=X[:, :2], n_clusters=len(X))
        msg = "does not match the number of features of the data"
        assert_raises_regex(ValueError, msg, km.fit, X)
        # for callable init
        km = Class(n_init=1,
                   init=lambda X_, k, random_state: X_[:, :2],
                   n_clusters=len(X))
        assert_raises_regex(ValueError, msg, km.fit, X)
        # mismatch of number of clusters
        msg = "does not match the number of clusters"
        km = Class(n_init=1, init=X[:2, :], n_clusters=3)
        assert_raises_regex(ValueError, msg, km.fit, X)
        # for callable init
        km = Class(n_init=1,
                   init=lambda X_, k, random_state: X_[:2, :],
                   n_clusters=3)
        assert_raises_regex(ValueError, msg, km.fit, X)
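

# For contrast with the failing cases above, an illustrative sketch of a
# well-formed callable init: it must accept ``(X, n_clusters, random_state)``
# and return an array of shape (n_clusters, n_features), e.g.
#
#     def valid_init(X_, k, random_state):
#         return X_[:k, :]
#
#     KMeans(n_clusters=3, init=valid_init, n_init=1).fit(X)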


def test_k_means_fortran_aligned_data():
    # Check that KMeans also works on Fortran-aligned (column-major) data.
    X = np.asfortranarray([[0, 0], [0, 1], [0, 1]])
    centers = np.array([[0, 0], [0, 1]])
    labels = np.array([0, 1, 1])
    km = KMeans(n_init=1, init=centers, precompute_distances=False,
                random_state=42, n_clusters=2)
    km.fit(X)
    assert_array_equal(km.cluster_centers_, centers)
    assert_array_equal(km.labels_, labels)
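

# Note: ``np.asfortranarray`` returns a column-major (Fortran-ordered) array,
# i.e. ``X.flags['F_CONTIGUOUS']`` is True, so the test above guards the
# compiled code paths against silently assuming C-contiguous input.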


def test_mb_k_means_plus_plus_init_dense_array():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X)
    _check_fitted_model(mb_k_means)


def test_mb_kmeans_verbose():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42, verbose=1)
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        mb_k_means.fit(X)
    finally:
        sys.stdout = old_stdout


def test_mb_k_means_plus_plus_init_sparse_matrix():
    mb_k_means = MiniBatchKMeans(init="k-means++", n_clusters=n_clusters,
                                 random_state=42)
    mb_k_means.fit(X_csr)
    _check_fitted_model(mb_k_means)


def test_minibatch_init_with_large_k():
    mb_k_means = MiniBatchKMeans(init='k-means++', init_size=10,
                                 n_clusters=20)
    # Check that a warning is raised, as the number of clusters is larger
    # than the init_size
    assert_warns(RuntimeWarning, mb_k_means.fit, X)


def test_minibatch_k_means_random_init_dense_array():
    # increase n_init to make random init stable enough
    mb_k_means = MiniBatchKMeans(init="random", n_clusters=n_clusters,
                                 random_state=42, n_init=10).fit(X)
    _check_fitted_model(mb_k_means)


def test_minibatch_k_means_random_init_sparse_csr():
    # increase n_init to make random init stable enough
    mb_k_means = MiniBatchKMeans(init="random", n_clusters=n_clusters,
                                 random_state=42, n_init=10).fit(X_csr)
    _check_fitted_model(mb_k_means)


def test_minibatch_k_means_perfect_init_dense_array():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=1).fit(X)
    _check_fitted_model(mb_k_means)


def test_minibatch_k_means_init_multiple_runs_with_explicit_centers():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=10)
    assert_warns(RuntimeWarning, mb_k_means.fit, X)


def test_minibatch_k_means_perfect_init_sparse_csr():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 random_state=42, n_init=1).fit(X_csr)
    _check_fitted_model(mb_k_means)


def test_minibatch_sensible_reassign_fit():
    # check that identical initial cluster centers are reassigned;
    # also a regression test for when there are more desired reassignments
    # than samples.
    zeroed_X, true_labels = make_blobs(n_samples=100, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=10,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)

    # do the same with batch_size > X.shape[0] (regression test)
    mb_k_means = MiniBatchKMeans(n_clusters=20, batch_size=201,
                                 random_state=42, init="random")
    mb_k_means.fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)


def test_minibatch_sensible_reassign_partial_fit():
    zeroed_X, true_labels = make_blobs(n_samples=n_samples, centers=5,
                                       cluster_std=1., random_state=42)
    zeroed_X[::2, :] = 0
    mb_k_means = MiniBatchKMeans(n_clusters=20, random_state=42,
                                 init="random")
    for i in range(100):
        mb_k_means.partial_fit(zeroed_X)
    # there should not be too many exact zero cluster centers
    assert_greater(mb_k_means.cluster_centers_.any(axis=1).sum(), 10)


def test_minibatch_reassign():
    # Give a perfect initialization, but a large reassignment_ratio; as a
    # result all the centers should be reassigned and the model should no
    # longer be good
    for this_X in (X, X_csr):
        mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100,
                                     random_state=42)
        mb_k_means.fit(this_X)

        score_before = mb_k_means.score(this_X)
        old_stdout = sys.stdout
        sys.stdout = StringIO()
        try:
            # Turn on verbosity to smoke test the display code
            _mini_batch_step(this_X, (X ** 2).sum(axis=1),
                             mb_k_means.cluster_centers_,
                             mb_k_means.counts_,
                             np.zeros(X.shape[1], np.double),
                             False, distances=np.zeros(X.shape[0]),
                             random_reassign=True, random_state=42,
                             reassignment_ratio=1, verbose=True)
        finally:
            sys.stdout = old_stdout
        assert_greater(score_before, mb_k_means.score(this_X))

    # Give a perfect initialization, with a small reassignment_ratio,
    # no center should be reassigned
    for this_X in (X, X_csr):
        mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100,
                                     init=centers.copy(),
                                     random_state=42, n_init=1)
        mb_k_means.fit(this_X)
        clusters_before = mb_k_means.cluster_centers_
        _mini_batch_step(this_X, (X ** 2).sum(axis=1),
                         mb_k_means.cluster_centers_,
                         mb_k_means.counts_,
                         np.zeros(X.shape[1], np.double),
                         False, distances=np.zeros(X.shape[0]),
                         random_reassign=True, random_state=42,
                         reassignment_ratio=1e-15)
        assert_array_almost_equal(clusters_before,
                                  mb_k_means.cluster_centers_)
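

# Note on the knob exercised above: ``reassignment_ratio`` controls the
# fraction of the maximum per-center count below which a center is treated
# as starved and becomes a candidate for random reassignment. Roughly, a
# ratio of 1 makes nearly every center eligible (first loop), while a tiny
# ratio such as 1e-15 makes reassignment effectively impossible (second
# loop).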


def test_minibatch_with_many_reassignments():
    # Test for the case that the number of clusters to reassign is bigger
    # than the batch_size
    n_samples = 550
    rnd = np.random.RandomState(42)
    X = rnd.uniform(size=(n_samples, 10))
    # Check that the fit works if n_clusters is bigger than the batch_size.
    # Run the test with 550 clusters and 550 samples, because these values
    # ensure that the number of clusters to reassign is always bigger than
    # the batch_size
    n_clusters = 550
    MiniBatchKMeans(n_clusters=n_clusters,
                    batch_size=100,
                    init_size=n_samples,
                    random_state=42).fit(X)


def test_sparse_mb_k_means_callable_init():

    def test_init(X, k, random_state):
        return centers

    # Small test to check that giving the wrong number of centers
    # raises a meaningful error
    msg = "does not match the number of clusters"
    assert_raises_regex(ValueError, msg,
                        MiniBatchKMeans(init=test_init,
                                        random_state=42).fit,
                        X_csr)

    # Now check that the fit actually works
    mb_k_means = MiniBatchKMeans(n_clusters=3, init=test_init,
                                 random_state=42).fit(X_csr)
    _check_fitted_model(mb_k_means)


def test_mini_batch_k_means_random_init_partial_fit():
    km = MiniBatchKMeans(n_clusters=n_clusters, init="random",
                         random_state=42)

    # use the partial_fit API for online learning
    for X_minibatch in np.array_split(X, 10):
        km.partial_fit(X_minibatch)

    # compute the labeling on the complete dataset
    labels = km.predict(X)
    assert_equal(v_measure_score(true_labels, labels), 1.0)


def test_minibatch_default_init_size():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 batch_size=10, random_state=42,
                                 n_init=1).fit(X)
    assert_equal(mb_k_means.init_size_, 3 * mb_k_means.batch_size)
    _check_fitted_model(mb_k_means)


def test_minibatch_tol():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=10,
                                 random_state=42, tol=.01).fit(X)
    _check_fitted_model(mb_k_means)


def test_minibatch_set_init_size():
    mb_k_means = MiniBatchKMeans(init=centers.copy(), n_clusters=n_clusters,
                                 init_size=666, random_state=42,
                                 n_init=1).fit(X)
    assert_equal(mb_k_means.init_size, 666)
    assert_equal(mb_k_means.init_size_, n_samples)
    _check_fitted_model(mb_k_means)


def test_k_means_invalid_init():
    km = KMeans(init="invalid", n_init=1, n_clusters=n_clusters)
    assert_raises(ValueError, km.fit, X)


def test_mini_batch_k_means_invalid_init():
    km = MiniBatchKMeans(init="invalid", n_init=1, n_clusters=n_clusters)
    assert_raises(ValueError, km.fit, X)


def test_k_means_copyx():
    # Check that fitting with copy_x=False leaves X nearly unchanged: the
    # data is centered in place during fit and de-centered on completion.
    my_X = X.copy()
    km = KMeans(copy_x=False, n_clusters=n_clusters, random_state=42)
    km.fit(my_X)
    _check_fitted_model(km)

    # check that my_X was restored to its original (de-centered) values
    assert_array_almost_equal(my_X, X)


def test_k_means_non_collapsed():
    # Check k_means with a bad initialization does not yield a singleton.
    # Starting with bad centers that are quickly ignored should not
    # result in a repositioning of the centers to the center of mass that
    # would lead to collapsed centers, which in turn would make the
    # clustering dependent on numerical instabilities.
    my_X = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
    array_init = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
    km = KMeans(init=array_init, n_clusters=3, random_state=42, n_init=1)
    km.fit(my_X)

    # centers must not have collapsed
    assert_equal(len(np.unique(km.labels_)), 3)

    centers = km.cluster_centers_
    assert_true(np.linalg.norm(centers[0] - centers[1]) >= 0.1)
    assert_true(np.linalg.norm(centers[0] - centers[2]) >= 0.1)
    assert_true(np.linalg.norm(centers[1] - centers[2]) >= 0.1)


def test_predict():
    km = KMeans(n_clusters=n_clusters, random_state=42)

    km.fit(X)

    # sanity check: predict centroid labels
    pred = km.predict(km.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # sanity check: re-predict labeling for training set samples
    pred = km.predict(X)
    assert_array_equal(pred, km.labels_)

    # re-predict labels for training set using fit_predict
    pred = km.fit_predict(X)
    assert_array_equal(pred, km.labels_)


def test_score():
    km1 = KMeans(n_clusters=n_clusters, max_iter=1, random_state=42, n_init=1)
    s1 = km1.fit(X).score(X)
    km2 = KMeans(n_clusters=n_clusters, max_iter=10, random_state=42,
                 n_init=1)
    s2 = km2.fit(X).score(X)
    assert_greater(s2, s1)

    km1 = KMeans(n_clusters=n_clusters, max_iter=1, random_state=42, n_init=1,
                 algorithm='elkan')
    s1 = km1.fit(X).score(X)
    km2 = KMeans(n_clusters=n_clusters, max_iter=10, random_state=42,
                 n_init=1, algorithm='elkan')
    s2 = km2.fit(X).score(X)
    assert_greater(s2, s1)
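

# Note: ``KMeans.score`` returns the *negative* inertia of the samples, so
# "greater" means "better" in the assertions above. For a converged fit
# evaluated on its own training data the two quantities coincide
# (illustrative sketch, assuming a converged estimator ``km``):
#
#     assert_almost_equal(km.score(X), -km.inertia_)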


def test_predict_minibatch_dense_input():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters,
                                 random_state=40).fit(X)

    # sanity check: predict centroid labels
    pred = mb_k_means.predict(mb_k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # sanity check: re-predict labeling for training set samples
    pred = mb_k_means.predict(X)
    assert_array_equal(pred, mb_k_means.labels_)


def test_predict_minibatch_kmeanspp_init_sparse_input():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++',
                                 n_init=10).fit(X_csr)

    # sanity check: re-predict labeling for training set samples
    assert_array_equal(mb_k_means.predict(X_csr), mb_k_means.labels_)

    # sanity check: predict centroid labels
    pred = mb_k_means.predict(mb_k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # check that models trained on sparse input also work for dense input
    # at predict time
    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_)


def test_predict_minibatch_random_init_sparse_input():
    mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, init='random',
                                 n_init=10).fit(X_csr)

    # sanity check: re-predict labeling for training set samples
    assert_array_equal(mb_k_means.predict(X_csr), mb_k_means.labels_)

    # sanity check: predict centroid labels
    pred = mb_k_means.predict(mb_k_means.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # check that models trained on sparse input also work for dense input
    # at predict time
    assert_array_equal(mb_k_means.predict(X), mb_k_means.labels_)


def test_int_input():
    X_list = [[0, 0], [10, 10], [12, 9], [-1, 1], [2, 0], [8, 10]]
    for dtype in [np.int32, np.int64]:
        X_int = np.array(X_list, dtype=dtype)
        X_int_csr = sp.csr_matrix(X_int)
        init_int = X_int[:2]

        fitted_models = [
            KMeans(n_clusters=2).fit(X_int),
            KMeans(n_clusters=2, init=init_int, n_init=1).fit(X_int),
            # mini batch kmeans is very unstable on such a small dataset,
            # hence we use many inits
            MiniBatchKMeans(n_clusters=2, n_init=10,
                            batch_size=2).fit(X_int),
            MiniBatchKMeans(n_clusters=2, n_init=10,
                            batch_size=2).fit(X_int_csr),
            MiniBatchKMeans(n_clusters=2, batch_size=2,
                            init=init_int, n_init=1).fit(X_int),
            MiniBatchKMeans(n_clusters=2, batch_size=2,
                            init=init_int, n_init=1).fit(X_int_csr),
        ]

        # integer input must be cast to float64 internally
        for km in fitted_models:
            assert_equal(km.cluster_centers_.dtype, np.float64)

        expected_labels = [0, 1, 1, 0, 0, 1]
        scores = np.array([v_measure_score(expected_labels, km.labels_)
                           for km in fitted_models])
        assert_array_equal(scores, np.ones(scores.shape[0]))


def test_transform():
    km = KMeans(n_clusters=n_clusters)
    km.fit(X)
    X_new = km.transform(km.cluster_centers_)

    for c in range(n_clusters):
        # the distance of each center to itself must be zero
        assert_equal(X_new[c, c], 0)
        for c2 in range(n_clusters):
            if c != c2:
                # and the distance to every other center strictly positive
                assert_greater(X_new[c, c2], 0)
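

# Note: ``transform`` maps samples into the cluster-distance space; entry
# (i, j) is the euclidean distance from sample i to center j. Transforming
# the centers themselves therefore yields an exactly zero diagonal and
# strictly positive off-diagonal entries, which is what the loop above
# checks.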


def test_fit_transform():
    X1 = KMeans(n_clusters=3, random_state=51).fit(X).transform(X)
    X2 = KMeans(n_clusters=3, random_state=51).fit_transform(X)
    assert_array_equal(X1, X2)


def test_predict_equal_labels():
    km = KMeans(random_state=13, n_jobs=1, n_init=1, max_iter=1,
                algorithm='full')
    km.fit(X)
    assert_array_equal(km.predict(X), km.labels_)

    km = KMeans(random_state=13, n_jobs=1, n_init=1, max_iter=1,
                algorithm='elkan')
    km.fit(X)
    assert_array_equal(km.predict(X), km.labels_)


def test_full_vs_elkan():
    km1 = KMeans(algorithm='full', random_state=13)
    km2 = KMeans(algorithm='elkan', random_state=13)

    km1.fit(X)
    km2.fit(X)

    assert_almost_equal(
        homogeneity_score(km1.predict(X), km2.predict(X)), 1.0)


def test_n_init():
    # Check that increasing the number of random inits improves the quality
    n_runs = 5
    n_init_range = [1, 5, 10]
    inertia = np.zeros((len(n_init_range), n_runs))
    for i, n_init in enumerate(n_init_range):
        for j in range(n_runs):
            km = KMeans(n_clusters=n_clusters, init="random", n_init=n_init,
                        random_state=j).fit(X)
            inertia[i, j] = km.inertia_

    inertia = inertia.mean(axis=1)
    failure_msg = ("Inertia %r should be decreasing"
                   " when n_init is increasing.") % list(inertia)
    for i in range(len(n_init_range) - 1):
        assert_true(inertia[i] >= inertia[i + 1], failure_msg)


def test_k_means_function():
    # test calling the k_means function directly
    # catch output
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        cluster_centers, labels, inertia = k_means(X, n_clusters=n_clusters,
                                                   verbose=True)
    finally:
        sys.stdout = old_stdout
    assert_equal(cluster_centers.shape, (n_clusters, n_features))

    assert_equal(np.unique(labels).shape[0], n_clusters)

    # check that the label assignment is perfect (up to a permutation)
    assert_equal(v_measure_score(true_labels, labels), 1.0)
    assert_greater(inertia, 0.0)

    # check warning when centers are passed
    assert_warns(RuntimeWarning, k_means, X, n_clusters=n_clusters,
                 init=cluster_centers)

    # too many clusters desired
    assert_raises(ValueError, k_means, X, n_clusters=X.shape[0] + 1)


def test_x_squared_norms_init_centroids():
    # Test that x_squared_norms can be None in _init_centroids
    from sklearn.cluster.k_means_ import _init_centroids

    X_norms = np.sum(X ** 2, axis=1)
    precompute = _init_centroids(
        X, 3, "k-means++", random_state=0, x_squared_norms=X_norms)
    assert_array_equal(
        precompute,
        _init_centroids(X, 3, "k-means++", random_state=0))
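

# Note: ``x_squared_norms`` exists purely as a speed optimization. Squared
# euclidean distances expand as
#
#     ||x - c||^2 = ||x||^2 - 2 * x.dot(c) + ||c||^2
#
# so the ||x||^2 terms can be precomputed once (e.g. with
# ``row_norms(X, squared=True)``) and reused across candidate centers;
# passing None simply makes the function recompute them internally, which is
# why both calls above must return identical seeds.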


def test_max_iter_error():
    km = KMeans(max_iter=-1)
    assert_raise_message(ValueError, 'Number of iterations should be',
                         km.fit, X)


def test_float_precision():
    km = KMeans(n_init=1, random_state=30)
    mb_km = MiniBatchKMeans(n_init=1, random_state=30)

    inertia = {}
    X_new = {}
    centers = {}

    for estimator in [km, mb_km]:
        for is_sparse in [False, True]:
            for dtype in [np.float64, np.float32]:
                if is_sparse:
                    X_test = sp.csr_matrix(X_csr, dtype=dtype)
                else:
                    X_test = X.astype(dtype)
                estimator.fit(X_test)
                # dtype of cluster centers has to be the dtype of the input
                # data
                assert_equal(estimator.cluster_centers_.dtype, dtype)
                inertia[dtype] = estimator.inertia_
                X_new[dtype] = estimator.transform(X_test)
                centers[dtype] = estimator.cluster_centers_
                # ensure the extracted row is a 2d array
                assert_equal(estimator.predict(X_test[:1]),
                             estimator.labels_[0])
                if hasattr(estimator, 'partial_fit'):
                    estimator.partial_fit(X_test[0:3])
                    # dtype of cluster centers has to stay the same after
                    # partial_fit
                    assert_equal(estimator.cluster_centers_.dtype, dtype)

            # compare arrays with low precision since 32 bit and 64 bit
            # arithmetic can differ up to the 4th decimal place
            assert_array_almost_equal(inertia[np.float32],
                                      inertia[np.float64], decimal=4)
            assert_array_almost_equal(X_new[np.float32], X_new[np.float64],
                                      decimal=4)
            assert_array_almost_equal(centers[np.float32],
                                      centers[np.float64], decimal=4)


def test_k_means_init_centers():
    # Check that KMeans does not silently mutate the user-provided init
    # array, even when the input data and the init centers have the same type
    X_small = np.array([[1.1, 1.1], [-7.5, -7.5], [-1.1, -1.1], [7.5, 7.5]])
    init_centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, -5.0]])
    for dtype in [np.int32, np.int64, np.float32, np.float64]:
        X_test = dtype(X_small)
        init_centers_test = dtype(init_centers)
        assert_array_equal(init_centers, init_centers_test)
        km = KMeans(init=init_centers_test, n_clusters=3, n_init=1)
        km.fit(X_test)
        assert_equal(False, np.may_share_memory(km.cluster_centers_,
                                                init_centers))


def test_sparse_k_means_init_centers():
    from sklearn.datasets import load_iris

    iris = load_iris()
    X = iris.data

    # Get a local optimum
    centers = KMeans(n_clusters=3).fit(X).cluster_centers_

    # Fitting from a local optimum shouldn't change the solution
    np.testing.assert_allclose(
        centers,
        KMeans(n_clusters=3,
               init=centers,
               n_init=1).fit(X).cluster_centers_
    )

    # The same should be true when X is sparse
    X_sparse = sp.csr_matrix(X)
    np.testing.assert_allclose(
        centers,
        KMeans(n_clusters=3,
               init=centers,
               n_init=1).fit(X_sparse).cluster_centers_
    )


def test_sparse_validate_centers():
    from sklearn.datasets import load_iris

    iris = load_iris()
    X = iris.data

    # Get a local optimum
    centers = KMeans(n_clusters=4).fit(X).cluster_centers_

    # Test that a ValueError is raised for validate_center_shape
    classifier = KMeans(n_clusters=3, init=centers, n_init=1)

    msg = r"The shape of the initial centers \(\(4L?, 4L?\)\) " \
          r"does not match the number of clusters 3"
    assert_raises_regex(ValueError, msg, classifier.fit, X)