laywerrobot/lib/python3.6/site-packages/sklearn/neighbors/tests/test_approximate.py

498 lines
20 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
"""
Testing for the approximate neighbor search using
Locality Sensitive Hashing Forest module
(sklearn.neighbors.LSHForest).
"""
# Author: Maheshakya Wijewardena, Joel Nothman
import numpy as np
import scipy.sparse as sp
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_array_almost_equal
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_array_less
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_not_equal
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import ignore_warnings
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.neighbors import LSHForest
from sklearn.neighbors import NearestNeighbors
def test_lsh_forest_deprecation():
assert_warns_message(DeprecationWarning,
"LSHForest has poor performance and has been "
"deprecated in 0.19. It will be removed "
"in version 0.21.", LSHForest)
def test_neighbors_accuracy_with_n_candidates():
# Checks whether accuracy increases as `n_candidates` increases.
n_candidates_values = np.array([.1, 50, 500])
n_samples = 100
n_features = 10
n_iter = 10
n_points = 5
rng = np.random.RandomState(42)
accuracies = np.zeros(n_candidates_values.shape[0], dtype=float)
X = rng.rand(n_samples, n_features)
for i, n_candidates in enumerate(n_candidates_values):
lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
n_candidates=n_candidates)
ignore_warnings(lshf.fit)(X)
for j in range(n_iter):
query = X[rng.randint(0, n_samples)].reshape(1, -1)
neighbors = lshf.kneighbors(query, n_neighbors=n_points,
return_distance=False)
distances = pairwise_distances(query, X, metric='cosine')
ranks = np.argsort(distances)[0, :n_points]
intersection = np.intersect1d(ranks, neighbors).shape[0]
ratio = intersection / float(n_points)
accuracies[i] = accuracies[i] + ratio
accuracies[i] = accuracies[i] / float(n_iter)
# Sorted accuracies should be equal to original accuracies
print('accuracies:', accuracies)
assert_true(np.all(np.diff(accuracies) >= 0),
msg="Accuracies are not non-decreasing.")
# Highest accuracy should be strictly greater than the lowest
assert_true(np.ptp(accuracies) > 0,
msg="Highest accuracy is not strictly greater than lowest.")
def test_neighbors_accuracy_with_n_estimators():
# Checks whether accuracy increases as `n_estimators` increases.
n_estimators = np.array([1, 10, 100])
n_samples = 100
n_features = 10
n_iter = 10
n_points = 5
rng = np.random.RandomState(42)
accuracies = np.zeros(n_estimators.shape[0], dtype=float)
X = rng.rand(n_samples, n_features)
for i, t in enumerate(n_estimators):
lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
n_candidates=500, n_estimators=t)
ignore_warnings(lshf.fit)(X)
for j in range(n_iter):
query = X[rng.randint(0, n_samples)].reshape(1, -1)
neighbors = lshf.kneighbors(query, n_neighbors=n_points,
return_distance=False)
distances = pairwise_distances(query, X, metric='cosine')
ranks = np.argsort(distances)[0, :n_points]
intersection = np.intersect1d(ranks, neighbors).shape[0]
ratio = intersection / float(n_points)
accuracies[i] = accuracies[i] + ratio
accuracies[i] = accuracies[i] / float(n_iter)
# Sorted accuracies should be equal to original accuracies
assert_true(np.all(np.diff(accuracies) >= 0),
msg="Accuracies are not non-decreasing.")
# Highest accuracy should be strictly greater than the lowest
assert_true(np.ptp(accuracies) > 0,
msg="Highest accuracy is not strictly greater than lowest.")
@ignore_warnings
def test_kneighbors():
# Checks whether desired number of neighbors are returned.
# It is guaranteed to return the requested number of neighbors
# if `min_hash_match` is set to 0. Returned distances should be
# in ascending order.
n_samples = 12
n_features = 2
n_iter = 10
rng = np.random.RandomState(42)
X = rng.rand(n_samples, n_features)
lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
min_hash_match=0)
# Test unfitted estimator
assert_raises(ValueError, lshf.kneighbors, X[0])
ignore_warnings(lshf.fit)(X)
for i in range(n_iter):
n_neighbors = rng.randint(0, n_samples)
query = X[rng.randint(0, n_samples)].reshape(1, -1)
neighbors = lshf.kneighbors(query, n_neighbors=n_neighbors,
return_distance=False)
# Desired number of neighbors should be returned.
assert_equal(neighbors.shape[1], n_neighbors)
# Multiple points
n_queries = 5
queries = X[rng.randint(0, n_samples, n_queries)]
distances, neighbors = lshf.kneighbors(queries,
n_neighbors=1,
return_distance=True)
assert_equal(neighbors.shape[0], n_queries)
assert_equal(distances.shape[0], n_queries)
# Test only neighbors
neighbors = lshf.kneighbors(queries, n_neighbors=1,
return_distance=False)
assert_equal(neighbors.shape[0], n_queries)
# Test random point(not in the data set)
query = rng.randn(n_features).reshape(1, -1)
lshf.kneighbors(query, n_neighbors=1,
return_distance=False)
# Test n_neighbors at initialization
neighbors = lshf.kneighbors(query, return_distance=False)
assert_equal(neighbors.shape[1], 5)
# Test `neighbors` has an integer dtype
assert_true(neighbors.dtype.kind == 'i',
msg="neighbors are not in integer dtype.")
def test_radius_neighbors():
# Checks whether Returned distances are less than `radius`
# At least one point should be returned when the `radius` is set
# to mean distance from the considering point to other points in
# the database.
# Moreover, this test compares the radius neighbors of LSHForest
# with the `sklearn.neighbors.NearestNeighbors`.
n_samples = 12
n_features = 2
n_iter = 10
rng = np.random.RandomState(42)
X = rng.rand(n_samples, n_features)
lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
# Test unfitted estimator
assert_raises(ValueError, lshf.radius_neighbors, X[0])
ignore_warnings(lshf.fit)(X)
for i in range(n_iter):
# Select a random point in the dataset as the query
query = X[rng.randint(0, n_samples)].reshape(1, -1)
# At least one neighbor should be returned when the radius is the
# mean distance from the query to the points of the dataset.
mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
neighbors = lshf.radius_neighbors(query, radius=mean_dist,
return_distance=False)
assert_equal(neighbors.shape, (1,))
assert_equal(neighbors.dtype, object)
assert_greater(neighbors[0].shape[0], 0)
# All distances to points in the results of the radius query should
# be less than mean_dist
distances, neighbors = lshf.radius_neighbors(query,
radius=mean_dist,
return_distance=True)
assert_array_less(distances[0], mean_dist)
# Multiple points
n_queries = 5
queries = X[rng.randint(0, n_samples, n_queries)]
distances, neighbors = lshf.radius_neighbors(queries,
return_distance=True)
# dists and inds should not be 1D arrays or arrays of variable lengths
# hence the use of the object dtype.
assert_equal(distances.shape, (n_queries,))
assert_equal(distances.dtype, object)
assert_equal(neighbors.shape, (n_queries,))
assert_equal(neighbors.dtype, object)
# Compare with exact neighbor search
query = X[rng.randint(0, n_samples)].reshape(1, -1)
mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)
distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
# Radius-based queries do not sort the result points and the order
# depends on the method, the random_state and the dataset order. Therefore
# we need to sort the results ourselves before performing any comparison.
sorted_dists_exact = np.sort(distances_exact[0])
sorted_dists_approx = np.sort(distances_approx[0])
# Distances to exact neighbors are less than or equal to approximate
# counterparts as the approximate radius query might have missed some
# closer neighbors.
assert_true(np.all(np.less_equal(sorted_dists_exact,
sorted_dists_approx)))
@ignore_warnings
def test_radius_neighbors_boundary_handling():
X = [[0.999, 0.001], [0.5, 0.5], [0, 1.], [-1., 0.001]]
n_points = len(X)
# Build an exact nearest neighbors model as reference model to ensure
# consistency between exact and approximate methods
nnbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)
# Build a LSHForest model with hyperparameter values that always guarantee
# exact results on this toy dataset.
lsfh = ignore_warnings(LSHForest, category=DeprecationWarning)(
min_hash_match=0, n_candidates=n_points, random_state=42).fit(X)
# define a query aligned with the first axis
query = [[1., 0.]]
# Compute the exact cosine distances of the query to the four points of
# the dataset
dists = pairwise_distances(query, X, metric='cosine').ravel()
# The first point is almost aligned with the query (very small angle),
# the cosine distance should therefore be almost null:
assert_almost_equal(dists[0], 0, decimal=5)
# The second point form an angle of 45 degrees to the query vector
assert_almost_equal(dists[1], 1 - np.cos(np.pi / 4))
# The third point is orthogonal from the query vector hence at a distance
# exactly one:
assert_almost_equal(dists[2], 1)
# The last point is almost colinear but with opposite sign to the query
# therefore it has a cosine 'distance' very close to the maximum possible
# value of 2.
assert_almost_equal(dists[3], 2, decimal=5)
# If we query with a radius of one, all the samples except the last sample
# should be included in the results. This means that the third sample
# is lying on the boundary of the radius query:
exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1)
approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1)
assert_array_equal(np.sort(exact_idx[0]), [0, 1, 2])
assert_array_equal(np.sort(approx_idx[0]), [0, 1, 2])
assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-1])
assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-1])
# If we perform the same query with a slightly lower radius, the third
# point of the dataset that lay on the boundary of the previous query
# is now rejected:
eps = np.finfo(np.float64).eps
exact_dists, exact_idx = nnbrs.radius_neighbors(query, radius=1 - eps)
approx_dists, approx_idx = lsfh.radius_neighbors(query, radius=1 - eps)
assert_array_equal(np.sort(exact_idx[0]), [0, 1])
assert_array_equal(np.sort(approx_idx[0]), [0, 1])
assert_array_almost_equal(np.sort(exact_dists[0]), dists[:-2])
assert_array_almost_equal(np.sort(approx_dists[0]), dists[:-2])
def test_distances():
# Checks whether returned neighbors are from closest to farthest.
n_samples = 12
n_features = 2
n_iter = 10
rng = np.random.RandomState(42)
X = rng.rand(n_samples, n_features)
lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
ignore_warnings(lshf.fit)(X)
for i in range(n_iter):
n_neighbors = rng.randint(0, n_samples)
query = X[rng.randint(0, n_samples)].reshape(1, -1)
distances, neighbors = lshf.kneighbors(query,
n_neighbors=n_neighbors,
return_distance=True)
# Returned neighbors should be from closest to farthest, that is
# increasing distance values.
assert_true(np.all(np.diff(distances[0]) >= 0))
# Note: the radius_neighbors method does not guarantee the order of
# the results.
def test_fit():
# Checks whether `fit` method sets all attribute values correctly.
n_samples = 12
n_features = 2
n_estimators = 5
rng = np.random.RandomState(42)
X = rng.rand(n_samples, n_features)
lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
n_estimators=n_estimators)
ignore_warnings(lshf.fit)(X)
# _input_array = X
assert_array_equal(X, lshf._fit_X)
# A hash function g(p) for each tree
assert_equal(n_estimators, len(lshf.hash_functions_))
# Hash length = 32
assert_equal(32, lshf.hash_functions_[0].components_.shape[0])
# Number of trees_ in the forest
assert_equal(n_estimators, len(lshf.trees_))
# Each tree has entries for every data point
assert_equal(n_samples, len(lshf.trees_[0]))
# Original indices after sorting the hashes
assert_equal(n_estimators, len(lshf.original_indices_))
# Each set of original indices in a tree has entries for every data point
assert_equal(n_samples, len(lshf.original_indices_[0]))
def test_partial_fit():
# Checks whether inserting array is consistent with fitted data.
# `partial_fit` method should set all attribute values correctly.
n_samples = 12
n_samples_partial_fit = 3
n_features = 2
rng = np.random.RandomState(42)
X = rng.rand(n_samples, n_features)
X_partial_fit = rng.rand(n_samples_partial_fit, n_features)
lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
# Test unfitted estimator
ignore_warnings(lshf.partial_fit)(X)
assert_array_equal(X, lshf._fit_X)
ignore_warnings(lshf.fit)(X)
# Insert wrong dimension
assert_raises(ValueError, lshf.partial_fit,
np.random.randn(n_samples_partial_fit, n_features - 1))
ignore_warnings(lshf.partial_fit)(X_partial_fit)
# size of _input_array = samples + 1 after insertion
assert_equal(lshf._fit_X.shape[0],
n_samples + n_samples_partial_fit)
# size of original_indices_[1] = samples + 1
assert_equal(len(lshf.original_indices_[0]),
n_samples + n_samples_partial_fit)
# size of trees_[1] = samples + 1
assert_equal(len(lshf.trees_[1]),
n_samples + n_samples_partial_fit)
def test_hash_functions():
# Checks randomness of hash functions.
# Variance and mean of each hash function (projection vector)
# should be different from flattened array of hash functions.
# If hash functions are not randomly built (seeded with
# same value), variances and means of all functions are equal.
n_samples = 12
n_features = 2
n_estimators = 5
rng = np.random.RandomState(42)
X = rng.rand(n_samples, n_features)
lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
n_estimators=n_estimators,
random_state=rng.randint(0, np.iinfo(np.int32).max))
ignore_warnings(lshf.fit)(X)
hash_functions = []
for i in range(n_estimators):
hash_functions.append(lshf.hash_functions_[i].components_)
for i in range(n_estimators):
assert_not_equal(np.var(hash_functions),
np.var(lshf.hash_functions_[i].components_))
for i in range(n_estimators):
assert_not_equal(np.mean(hash_functions),
np.mean(lshf.hash_functions_[i].components_))
def test_candidates():
# Checks whether candidates are sufficient.
# This should handle the cases when number of candidates is 0.
# User should be warned when number of candidates is less than
# requested number of neighbors.
X_train = np.array([[5, 5, 2], [21, 5, 5], [1, 1, 1], [8, 9, 1],
[6, 10, 2]], dtype=np.float32)
X_test = np.array([7, 10, 3], dtype=np.float32).reshape(1, -1)
# For zero candidates
lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
min_hash_match=32)
ignore_warnings(lshf.fit)(X_train)
message = ("Number of candidates is not sufficient to retrieve"
" %i neighbors with"
" min_hash_match = %i. Candidates are filled up"
" uniformly from unselected"
" indices." % (3, 32))
assert_warns_message(UserWarning, message, lshf.kneighbors,
X_test, n_neighbors=3)
distances, neighbors = lshf.kneighbors(X_test, n_neighbors=3)
assert_equal(distances.shape[1], 3)
# For candidates less than n_neighbors
lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
min_hash_match=31)
ignore_warnings(lshf.fit)(X_train)
message = ("Number of candidates is not sufficient to retrieve"
" %i neighbors with"
" min_hash_match = %i. Candidates are filled up"
" uniformly from unselected"
" indices." % (5, 31))
assert_warns_message(UserWarning, message, lshf.kneighbors,
X_test, n_neighbors=5)
distances, neighbors = lshf.kneighbors(X_test, n_neighbors=5)
assert_equal(distances.shape[1], 5)
def test_graphs():
# Smoke tests for graph methods.
n_samples_sizes = [5, 10, 20]
n_features = 3
rng = np.random.RandomState(42)
for n_samples in n_samples_sizes:
X = rng.rand(n_samples, n_features)
lshf = ignore_warnings(LSHForest, category=DeprecationWarning)(
min_hash_match=0)
ignore_warnings(lshf.fit)(X)
kneighbors_graph = lshf.kneighbors_graph(X)
radius_neighbors_graph = lshf.radius_neighbors_graph(X)
assert_equal(kneighbors_graph.shape[0], n_samples)
assert_equal(kneighbors_graph.shape[1], n_samples)
assert_equal(radius_neighbors_graph.shape[0], n_samples)
assert_equal(radius_neighbors_graph.shape[1], n_samples)
def test_sparse_input():
# note: Fixed random state in sp.rand is not supported in older scipy.
# The test should succeed regardless.
X1 = sp.rand(50, 100)
X2 = sp.rand(10, 100)
forest_sparse = ignore_warnings(LSHForest, category=DeprecationWarning)(
radius=1, random_state=0).fit(X1)
forest_dense = ignore_warnings(LSHForest, category=DeprecationWarning)(
radius=1, random_state=0).fit(X1.A)
d_sparse, i_sparse = forest_sparse.kneighbors(X2, return_distance=True)
d_dense, i_dense = forest_dense.kneighbors(X2.A, return_distance=True)
assert_almost_equal(d_sparse, d_dense)
assert_almost_equal(i_sparse, i_dense)
d_sparse, i_sparse = forest_sparse.radius_neighbors(X2,
return_distance=True)
d_dense, i_dense = forest_dense.radius_neighbors(X2.A,
return_distance=True)
assert_equal(d_sparse.shape, d_dense.shape)
for a, b in zip(d_sparse, d_dense):
assert_almost_equal(a, b)
for a, b in zip(i_sparse, i_dense):
assert_almost_equal(a, b)