laywerrobot/lib/python3.6/site-packages/sklearn/neighbors/tests/test_kde.py

import numpy as np
from sklearn.utils.testing import (assert_allclose, assert_raises,
                                   assert_equal)
from sklearn.neighbors import KernelDensity, KDTree, NearestNeighbors
from sklearn.neighbors.ball_tree import kernel_norm
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler


def compute_kernel_slow(Y, X, kernel, h):
    d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1))
    norm = kernel_norm(h, X.shape[1], kernel) / X.shape[0]

    if kernel == 'gaussian':
        return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1)
    elif kernel == 'tophat':
        return norm * (d < h).sum(-1)
    elif kernel == 'epanechnikov':
        return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1)
    elif kernel == 'exponential':
        return norm * (np.exp(-d / h)).sum(-1)
    elif kernel == 'linear':
        return norm * ((1 - d / h) * (d < h)).sum(-1)
    elif kernel == 'cosine':
        return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1)
    else:
        raise ValueError('kernel not recognized')


def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):
    kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
                        atol=atol, rtol=rtol)
    log_dens = kde.fit(X).score_samples(Y)
    assert_allclose(np.exp(log_dens), dens_true,
                    atol=atol, rtol=max(1E-7, rtol))
    assert_allclose(np.exp(kde.score(Y)),
                    np.prod(dens_true),
                    atol=atol, rtol=max(1E-7, rtol))


def test_kernel_density(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)
    Y = rng.randn(n_samples, n_features)

    for kernel in ['gaussian', 'tophat', 'epanechnikov',
                   'exponential', 'linear', 'cosine']:
        for bandwidth in [0.01, 0.1, 1]:
            dens_true = compute_kernel_slow(Y, X, kernel, bandwidth)

            for rtol in [0, 1E-5]:
                for atol in [1E-6, 1E-2]:
                    for breadth_first in (True, False):
                        yield (check_results, kernel, bandwidth, atol, rtol,
                               X, Y, dens_true)


def test_kernel_density_sampling(n_samples=100, n_features=3):
    rng = np.random.RandomState(0)
    X = rng.randn(n_samples, n_features)

    bandwidth = 0.2

    for kernel in ['gaussian', 'tophat']:
        # draw a tophat sample
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        samp = kde.sample(100)
        assert_equal(X.shape, samp.shape)

        # check that samples are in the right range
        nbrs = NearestNeighbors(n_neighbors=1).fit(X)
        dist, ind = nbrs.kneighbors(X, return_distance=True)

        if kernel == 'tophat':
            assert np.all(dist < bandwidth)
        elif kernel == 'gaussian':
            # 5 standard deviations is safe for 100 samples, but there's a
            # very small chance this test could fail.
            assert np.all(dist < 5 * bandwidth)

    # check unsupported kernels
    for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
        kde = KernelDensity(bandwidth, kernel=kernel).fit(X)
        assert_raises(NotImplementedError, kde.sample, 100)

    # non-regression test: used to return a scalar
    X = rng.randn(4, 1)
    kde = KernelDensity(kernel="gaussian").fit(X)
    assert_equal(kde.sample().shape, (1, 1))


def test_kde_algorithm_metric_choice():
    # Smoke test for various metrics and algorithms
    rng = np.random.RandomState(0)
    X = rng.randn(10, 2)    # 2 features required for haversine dist.
    Y = rng.randn(10, 2)

    for algorithm in ['auto', 'ball_tree', 'kd_tree']:
        for metric in ['euclidean', 'minkowski', 'manhattan',
                       'chebyshev', 'haversine']:
            if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics:
                assert_raises(ValueError, KernelDensity,
                              algorithm=algorithm, metric=metric)
            else:
                kde = KernelDensity(algorithm=algorithm, metric=metric)
                kde.fit(X)
                y_dens = kde.score_samples(Y)
                assert_equal(y_dens.shape, Y.shape[:1])


def test_kde_score(n_samples=100, n_features=3):
    pass
    # FIXME
    # rng = np.random.RandomState(0)
    # X = rng.random_sample((n_samples, n_features))
    # Y = rng.random_sample((n_samples, n_features))


def test_kde_badargs():
    assert_raises(ValueError, KernelDensity,
                  algorithm='blah')
    assert_raises(ValueError, KernelDensity,
                  bandwidth=0)
    assert_raises(ValueError, KernelDensity,
                  kernel='blah')
    assert_raises(ValueError, KernelDensity,
                  metric='blah')
    assert_raises(ValueError, KernelDensity,
                  algorithm='kd_tree', metric='blah')


def test_kde_pipeline_gridsearch():
    # test that kde plays nice in pipelines and grid-searches
    X, _ = make_blobs(cluster_std=.1, random_state=1,
                      centers=[[0, 1], [1, 0], [0, 0]])
    pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
                          KernelDensity(kernel="gaussian"))
    params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
    search = GridSearchCV(pipe1, param_grid=params, cv=5)
    search.fit(X)
    assert_equal(search.best_params_['kerneldensity__bandwidth'], .1)
first commit 2020-08-27 21:55:39 +02:00			`import numpy as np`
			`from sklearn.utils.testing import (assert_allclose, assert_raises,`
			`assert_equal)`
			`from sklearn.neighbors import KernelDensity, KDTree, NearestNeighbors`
			`from sklearn.neighbors.ball_tree import kernel_norm`
			`from sklearn.pipeline import make_pipeline`
			`from sklearn.datasets import make_blobs`
			`from sklearn.model_selection import GridSearchCV`
			`from sklearn.preprocessing import StandardScaler`


			`def compute_kernel_slow(Y, X, kernel, h):`
			`d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1))`
			`norm = kernel_norm(h, X.shape[1], kernel) / X.shape[0]`

			`if kernel == 'gaussian':`
			`return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1)`
			`elif kernel == 'tophat':`
			`return norm * (d < h).sum(-1)`
			`elif kernel == 'epanechnikov':`
			`return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1)`
			`elif kernel == 'exponential':`
			`return norm * (np.exp(-d / h)).sum(-1)`
			`elif kernel == 'linear':`
			`return norm * ((1 - d / h) * (d < h)).sum(-1)`
			`elif kernel == 'cosine':`
			`return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1)`
			`else:`
			`raise ValueError('kernel not recognized')`


			`def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):`
			`kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,`
			`atol=atol, rtol=rtol)`
			`log_dens = kde.fit(X).score_samples(Y)`
			`assert_allclose(np.exp(log_dens), dens_true,`
			`atol=atol, rtol=max(1E-7, rtol))`
			`assert_allclose(np.exp(kde.score(Y)),`
			`np.prod(dens_true),`
			`atol=atol, rtol=max(1E-7, rtol))`


			`def test_kernel_density(n_samples=100, n_features=3):`
			`rng = np.random.RandomState(0)`
			`X = rng.randn(n_samples, n_features)`
			`Y = rng.randn(n_samples, n_features)`

			`for kernel in ['gaussian', 'tophat', 'epanechnikov',`
			`'exponential', 'linear', 'cosine']:`
			`for bandwidth in [0.01, 0.1, 1]:`
			`dens_true = compute_kernel_slow(Y, X, kernel, bandwidth)`

			`for rtol in [0, 1E-5]:`
			`for atol in [1E-6, 1E-2]:`
			`for breadth_first in (True, False):`
			`yield (check_results, kernel, bandwidth, atol, rtol,`
			`X, Y, dens_true)`


			`def test_kernel_density_sampling(n_samples=100, n_features=3):`
			`rng = np.random.RandomState(0)`
			`X = rng.randn(n_samples, n_features)`

			`bandwidth = 0.2`

			`for kernel in ['gaussian', 'tophat']:`
			`# draw a tophat sample`
			`kde = KernelDensity(bandwidth, kernel=kernel).fit(X)`
			`samp = kde.sample(100)`
			`assert_equal(X.shape, samp.shape)`

			`# check that samples are in the right range`
			`nbrs = NearestNeighbors(n_neighbors=1).fit(X)`
			`dist, ind = nbrs.kneighbors(X, return_distance=True)`

			`if kernel == 'tophat':`
			`assert np.all(dist < bandwidth)`
			`elif kernel == 'gaussian':`
			`# 5 standard deviations is safe for 100 samples, but there's a`
			`# very small chance this test could fail.`
			`assert np.all(dist < 5 * bandwidth)`

			`# check unsupported kernels`
			`for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:`
			`kde = KernelDensity(bandwidth, kernel=kernel).fit(X)`
			`assert_raises(NotImplementedError, kde.sample, 100)`

			`# non-regression test: used to return a scalar`
			`X = rng.randn(4, 1)`
			`kde = KernelDensity(kernel="gaussian").fit(X)`
			`assert_equal(kde.sample().shape, (1, 1))`


			`def test_kde_algorithm_metric_choice():`
			`# Smoke test for various metrics and algorithms`
			`rng = np.random.RandomState(0)`
			`X = rng.randn(10, 2) # 2 features required for haversine dist.`
			`Y = rng.randn(10, 2)`

			`for algorithm in ['auto', 'ball_tree', 'kd_tree']:`
			`for metric in ['euclidean', 'minkowski', 'manhattan',`
			`'chebyshev', 'haversine']:`
			`if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics:`
			`assert_raises(ValueError, KernelDensity,`
			`algorithm=algorithm, metric=metric)`
			`else:`
			`kde = KernelDensity(algorithm=algorithm, metric=metric)`
			`kde.fit(X)`
			`y_dens = kde.score_samples(Y)`
			`assert_equal(y_dens.shape, Y.shape[:1])`


			`def test_kde_score(n_samples=100, n_features=3):`
			`pass`
			`# FIXME`
			`# rng = np.random.RandomState(0)`
			`# X = rng.random_sample((n_samples, n_features))`
			`# Y = rng.random_sample((n_samples, n_features))`


			`def test_kde_badargs():`
			`assert_raises(ValueError, KernelDensity,`
			`algorithm='blah')`
			`assert_raises(ValueError, KernelDensity,`
			`bandwidth=0)`
			`assert_raises(ValueError, KernelDensity,`
			`kernel='blah')`
			`assert_raises(ValueError, KernelDensity,`
			`metric='blah')`
			`assert_raises(ValueError, KernelDensity,`
			`algorithm='kd_tree', metric='blah')`


			`def test_kde_pipeline_gridsearch():`
			`# test that kde plays nice in pipelines and grid-searches`
			`X, _ = make_blobs(cluster_std=.1, random_state=1,`
			`centers=[[0, 1], [1, 0], [0, 0]])`
			`pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),`
			`KernelDensity(kernel="gaussian"))`
			`params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])`
			`search = GridSearchCV(pipe1, param_grid=params, cv=5)`
			`search.fit(X)`
			`assert_equal(search.best_params_['kerneldensity__bandwidth'], .1)`