import numpy as np

from sklearn.utils.testing import (assert_allclose, assert_raises,
                                   assert_equal)
from sklearn.neighbors import KernelDensity, KDTree, NearestNeighbors
from sklearn.neighbors.ball_tree import kernel_norm
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

def compute_kernel_slow(Y, X, kernel, h):
    # Brute-force reference implementation: evaluate the kernel density
    # estimate of X at each query point in Y by summing over all pairs.
    d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1))
    norm = kernel_norm(h, X.shape[1], kernel) / X.shape[0]

    if kernel == 'gaussian':
        return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1)
    elif kernel == 'tophat':
        return norm * (d < h).sum(-1)
    elif kernel == 'epanechnikov':
        return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1)
    elif kernel == 'exponential':
        return norm * (np.exp(-d / h)).sum(-1)
    elif kernel == 'linear':
        return norm * ((1 - d / h) * (d < h)).sum(-1)
    elif kernel == 'cosine':
        return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1)
    else:
        raise ValueError('kernel not recognized')

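# A minimal sanity sketch (ours, not part of the original suite): for the
# gaussian kernel, compute_kernel_slow should agree with the exact
# (default atol=0, rtol=0) tree-based KernelDensity up to floating-point
# error. The function name and data here are illustrative assumptions.
def _example_slow_vs_tree_kde():
    rng = np.random.RandomState(42)
    X = rng.randn(20, 2)
    Y = rng.randn(5, 2)
    h = 0.5
    dens_slow = compute_kernel_slow(Y, X, 'gaussian', h)
    kde = KernelDensity(kernel='gaussian', bandwidth=h).fit(X)
    assert_allclose(dens_slow, np.exp(kde.score_samples(Y)), rtol=1E-7)
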
def check_results(kernel, bandwidth, atol, rtol, breadth_first, X, Y,
                  dens_true):
    # breadth_first is now passed through to the estimator; previously it
    # was looped over in test_kernel_density but never used.
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
                        atol=atol, rtol=rtol, breadth_first=breadth_first)
    log_dens = kde.fit(X).score_samples(Y)
    assert_allclose(np.exp(log_dens), dens_true,
                    atol=atol, rtol=max(1E-7, rtol))
    assert_allclose(np.exp(kde.score(Y)),
                    np.prod(dens_true),
                    atol=atol, rtol=max(1E-7, rtol))

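# Note on atol/rtol (a sketch under our reading of the KernelDensity docs):
# the tree-based evaluation may stop refining once the density is known to
# within atol + rtol * density, trading accuracy for speed. The helper
# below is illustrative only and is not collected as a test.
def _example_tolerance_tradeoff():
    rng = np.random.RandomState(1)
    X = rng.randn(200, 3)
    Y = rng.randn(10, 3)
    exact = KernelDensity(bandwidth=0.5).fit(X).score_samples(Y)
    approx = KernelDensity(bandwidth=0.5, atol=1E-2,
                           rtol=1E-2).fit(X).score_samples(Y)
    # densities (not log-densities) should agree within the tolerance
    assert_allclose(np.exp(approx), np.exp(exact), atol=1E-2, rtol=1E-2)
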
def test_kernel_density(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)
    Y = rng.randn(n_samples, n_features)

    for kernel in ['gaussian', 'tophat', 'epanechnikov',
                   'exponential', 'linear', 'cosine']:
        for bandwidth in [0.01, 0.1, 1]:
            dens_true = compute_kernel_slow(Y, X, kernel, bandwidth)

            for rtol in [0, 1E-5]:
                for atol in [1E-6, 1E-2]:
                    for breadth_first in (True, False):
                        # nose-style generator test: each yielded tuple is
                        # run as check_results(*args)
                        yield (check_results, kernel, bandwidth, atol,
                               rtol, breadth_first, X, Y, dens_true)

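# For readers unfamiliar with nose-style generator tests: each tuple yielded
# above is executed as check_results(kernel, bandwidth, atol, rtol,
# breadth_first, X, Y, dens_true). One expanded case might look like the
# sketch below (names and parameter values are illustrative).
def _example_single_case():
    rng = np.random.RandomState(0)
    X = rng.randn(100, 3)
    Y = rng.randn(100, 3)
    dens_true = compute_kernel_slow(Y, X, 'gaussian', 0.1)
    check_results('gaussian', 0.1, 1E-6, 0, True, X, Y, dens_true)
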
def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a sample from the fitted density
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range: each drawn point
        # should fall near some training point (querying the samples, not
        # X itself, since X's nearest-neighbor distances are trivially 0)
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(samp, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert_equal(kde.sample().shape, (1, 1))

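# Why the tophat bound above holds: sampling from a tophat KDE amounts to
# picking a training point uniformly and adding a uniform offset within the
# bandwidth ball, so every sample lies within `bandwidth` of its source
# point. A hand-rolled sketch of that recipe (our illustration, not
# scikit-learn's internal sampler):
def _example_manual_tophat_draw(bandwidth=0.2):
    rng = np.random.RandomState(0)
    X = rng.randn(100, 3)
    i = rng.randint(X.shape[0])          # uniformly chosen source point
    v = rng.randn(X.shape[1])
    v /= np.linalg.norm(v)               # uniform direction on the sphere
    r = bandwidth * rng.rand() ** (1. / X.shape[1])  # uniform-in-ball radius
    samp = X[i] + r * v
    assert np.linalg.norm(samp - X[i]) < bandwidth
    return samp
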
def test_kde_algorithm_metric_choice():
    # Smoke test for various metrics and algorithms
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)    # 2 features required for haversine dist.
    Y = rng.randn(10, 2)

    for algorithm in ['auto', 'ball_tree', 'kd_tree']:
        for metric in ['euclidean', 'minkowski', 'manhattan',
                       'chebyshev', 'haversine']:
            if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics:
                assert_raises(ValueError, KernelDensity,
                              algorithm=algorithm, metric=metric)
            else:
                kde = KernelDensity(algorithm=algorithm, metric=metric)
                kde.fit(X)
                y_dens = kde.score_samples(Y)
                assert_equal(y_dens.shape, Y.shape[:1])

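# The kd_tree branch above hinges on KDTree.valid_metrics: KD-trees support
# only axis-aligned metrics (euclidean, minkowski, manhattan, chebyshev,
# and aliases), while haversine is ball_tree-only, hence the expected
# ValueError. A minimal check of that assumption:
def _example_kd_tree_metric_support():
    assert 'euclidean' in KDTree.valid_metrics
    assert 'haversine' not in KDTree.valid_metrics
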
def test_kde_score(n_samples=100, n_features=3):
    pass
    # FIXME
    # rng = np.random.RandomState(0)
    # X = rng.random_sample((n_samples, n_features))
    # Y = rng.random_sample((n_samples, n_features))

def test_kde_badargs():
    assert_raises(ValueError, KernelDensity,
                  algorithm='blah')
    assert_raises(ValueError, KernelDensity,
                  bandwidth=0)
    assert_raises(ValueError, KernelDensity,
                  kernel='blah')
    assert_raises(ValueError, KernelDensity,
                  metric='blah')
    assert_raises(ValueError, KernelDensity,
                  algorithm='kd_tree', metric='blah')

def test_kde_pipeline_gridsearch():
    # test that kde plays nice in pipelines and grid-searches
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
                          KernelDensity(kernel="gaussian"))
    params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
    search = GridSearchCV(pipe1, param_grid=params, cv=5)
    search.fit(X)
    assert_equal(search.best_params_['kerneldensity__bandwidth'], .1)

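# The grid search above needs no labels: GridSearchCV maximizes the
# estimator's own score, and KernelDensity.score(X) is the total
# log-likelihood of X under the fitted density. A standalone sketch of the
# same bandwidth selection without the (identity) scaler pipeline:
def _example_bandwidth_selection():
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    grid = GridSearchCV(KernelDensity(kernel='gaussian'),
                        param_grid={'bandwidth': [0.001, 0.01, 0.1, 1, 10]},
                        cv=5)
    grid.fit(X)
    assert grid.best_params_['bandwidth'] == .1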