from __future__ import print_function

import types
import warnings
import sys
import traceback
import pickle
from copy import deepcopy
import numpy as np
from scipy import sparse
from scipy.stats import rankdata
import struct

from sklearn.externals.six.moves import zip
from sklearn.externals.joblib import hash, Memory
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_not_equal
from sklearn.utils.testing import assert_true
from sklearn.utils.testing import assert_false
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_allclose_dense_sparse
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import META_ESTIMATORS
from sklearn.utils.testing import set_random_state
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_greater_equal
from sklearn.utils.testing import SkipTest
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import assert_dict_equal
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


from sklearn.base import (clone, TransformerMixin, ClusterMixin,
                          BaseEstimator, is_classifier, is_regressor)
from sklearn.metrics import accuracy_score, adjusted_rand_score, f1_score

from sklearn.random_projection import BaseRandomProjection
from sklearn.feature_selection import SelectKBest
from sklearn.svm.base import BaseLibSVM
from sklearn.linear_model.stochastic_gradient import BaseSGD
from sklearn.pipeline import make_pipeline
from sklearn.exceptions import ConvergenceWarning
from sklearn.exceptions import DataConversionWarning
from sklearn.exceptions import SkipTestWarning
from sklearn.model_selection import train_test_split

from sklearn.utils import shuffle
from sklearn.utils.fixes import signature
from sklearn.utils.validation import has_fit_parameter, _num_samples
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris, load_boston, make_blobs


BOSTON = None
CROSS_DECOMPOSITION = ['PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']
MULTI_OUTPUT = ['CCA', 'DecisionTreeRegressor', 'ElasticNet',
                'ExtraTreeRegressor', 'ExtraTreesRegressor', 'GaussianProcess',
                'GaussianProcessRegressor',
                'KNeighborsRegressor', 'KernelRidge', 'Lars', 'Lasso',
                'LassoLars', 'LinearRegression', 'MultiTaskElasticNet',
                'MultiTaskElasticNetCV', 'MultiTaskLasso', 'MultiTaskLassoCV',
                'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression',
                'RANSACRegressor', 'RadiusNeighborsRegressor',
                'RandomForestRegressor', 'Ridge', 'RidgeCV']


def _yield_non_meta_checks(name, estimator):
    yield check_estimators_dtypes
    yield check_fit_score_takes_y
    yield check_dtype_object
    yield check_sample_weights_pandas_series
    yield check_sample_weights_list
    yield check_estimators_fit_returns_self

    # Check that all estimators yield informative messages when
    # trained on empty datasets
    yield check_estimators_empty_data_messages

    if name not in CROSS_DECOMPOSITION + ['SpectralEmbedding']:
        # SpectralEmbedding is non-deterministic,
        # see issue #4236
        # cross-decomposition's "transform" returns X and Y
        yield check_pipeline_consistency

    if name not in ['Imputer']:
        # Test that all estimators check their input for NaN's and infs
        yield check_estimators_nan_inf

    if name not in ['GaussianProcess']:
        # FIXME!
        # in particular GaussianProcess!
        yield check_estimators_overwrite_params
    if hasattr(estimator, 'sparsify'):
        yield check_sparsify_coefficients

    yield check_estimator_sparse_data

    # Test that estimators can be pickled, and once pickled
    # give the same answer as before.
    yield check_estimators_pickle


def _yield_classifier_checks(name, classifier):
    # test classifiers can handle non-array data
    yield check_classifier_data_not_an_array
    # test classifiers trained on a single label always return this label
    yield check_classifiers_one_label
    yield check_classifiers_classes
    yield check_estimators_partial_fit_n_features
    # basic consistency testing
    yield check_classifiers_train
    yield check_classifiers_regression_target
    if (name not in
            ["MultinomialNB", "LabelPropagation", "LabelSpreading"] and
            # TODO some complication with -1 label
            name not in ["DecisionTreeClassifier", "ExtraTreeClassifier"]):
        # We don't raise a warning in these classifiers, as
        # the column y interface is used by the forests.

        yield check_supervised_y_2d
    # test if NotFittedError is raised
    yield check_estimators_unfitted
    if 'class_weight' in classifier.get_params().keys():
        yield check_class_weight_classifiers

    yield check_non_transformer_estimators_n_iter
    # test if predict_proba is a monotonic transformation of decision_function
    yield check_decision_proba_consistency


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_supervised_y_no_nan(name, estimator_orig):
    # Checks that the Estimator targets are not NaN.
    estimator = clone(estimator_orig)
    rng = np.random.RandomState(888)
    X = rng.randn(10, 5)
    y = np.ones(10) * np.inf
    y = multioutput_estimator_convert_y_2d(estimator, y)

    errmsg = "Input contains NaN, infinity or a value too large for " \
             "dtype('float64')."
    try:
        estimator.fit(X, y)
    except ValueError as e:
        if str(e) != errmsg:
            raise ValueError("Estimator {0} raised error as expected, but "
                             "the message does not match the expected error "
                             "message".format(name))
    else:
        raise ValueError("Estimator {0} should have raised error on fitting "
                         "array y with NaN value.".format(name))


def _yield_regressor_checks(name, regressor):
    # TODO: test with intercept
    # TODO: test with multiple responses
    # basic testing
    yield check_regressors_train
    yield check_regressor_data_not_an_array
    yield check_estimators_partial_fit_n_features
    yield check_regressors_no_decision_function
    yield check_supervised_y_2d
    yield check_supervised_y_no_nan
    if name != 'CCA':
        # check that the regressor handles int input
        yield check_regressors_int
    if name != "GaussianProcessRegressor":
        # Test if NotFittedError is raised
        yield check_estimators_unfitted
    yield check_non_transformer_estimators_n_iter


def _yield_transformer_checks(name, transformer):
    # All transformers should either deal with sparse data or raise an
    # exception with type TypeError and an intelligible error message
    if name not in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer',
                    'PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']:
        yield check_transformer_data_not_an_array
    # these don't actually fit the data, so don't raise errors
    if name not in ['AdditiveChi2Sampler', 'Binarizer',
                    'FunctionTransformer', 'Normalizer']:
        # basic tests
        yield check_transformer_general
        yield check_transformers_unfitted
    # Dependent on external solvers and hence accessing the iter
    # param is non-trivial.
    external_solver = ['Isomap', 'KernelPCA', 'LocallyLinearEmbedding',
                       'RandomizedLasso', 'LogisticRegressionCV']
    if name not in external_solver:
        yield check_transformer_n_iter


def _yield_clustering_checks(name, clusterer):
    yield check_clusterer_compute_labels_predict
    if name not in ('WardAgglomeration', "FeatureAgglomeration"):
        # this is clustering on the features
        # let's not test that here.
        yield check_clustering
        yield check_estimators_partial_fit_n_features
    yield check_non_transformer_estimators_n_iter


def _yield_all_checks(name, estimator):
    for check in _yield_non_meta_checks(name, estimator):
        yield check
    if is_classifier(estimator):
        for check in _yield_classifier_checks(name, estimator):
            yield check
    if is_regressor(estimator):
        for check in _yield_regressor_checks(name, estimator):
            yield check
    if isinstance(estimator, TransformerMixin):
        for check in _yield_transformer_checks(name, estimator):
            yield check
    if isinstance(estimator, ClusterMixin):
        for check in _yield_clustering_checks(name, estimator):
            yield check
    yield check_fit2d_predict1d
    yield check_fit2d_1sample
    yield check_fit2d_1feature
    yield check_fit1d_1feature
    yield check_fit1d_1sample
    yield check_get_params_invariance
    yield check_dict_unchanged
    yield check_dont_overwrite_parameters
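

# A minimal sketch (kept as a comment, so it is not executed on import) of
# how the generator above can be consumed directly, e.g. to list the checks
# that apply to one estimator.  ``Ridge`` is only an illustrative choice:
#
#   >>> from sklearn.linear_model import Ridge
#   >>> est = Ridge()
#   >>> for check in _yield_all_checks(type(est).__name__, est):
#   ...     print(check.__name__)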


def check_estimator(Estimator):
    """Check if estimator adheres to scikit-learn conventions.

    This function will run an extensive test-suite for input validation,
    shapes, etc.
    Additional tests for classifiers, regressors, clustering or transformers
    will be run if the Estimator class inherits from the corresponding mixin
    from sklearn.base.

    This test can be applied to classes or instances.
    Classes currently have some additional tests related to construction,
    while passing instances allows the testing of multiple options.

    Parameters
    ----------
    estimator : estimator object or class
        Estimator to check. Estimator is a class object or instance.

    """
    if isinstance(Estimator, type):
        # got a class
        name = Estimator.__name__
        check_parameters_default_constructible(name, Estimator)
        check_no_fit_attributes_set_in_init(name, Estimator)
        estimator = Estimator()
    else:
        # got an instance
        estimator = Estimator
        name = type(estimator).__name__

    for check in _yield_all_checks(name, estimator):
        try:
            check(name, estimator)
        except SkipTest as message:
            # the only SkipTest thrown currently results from not
            # being able to import pandas.
            warnings.warn(str(message), SkipTestWarning)
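

# Usage sketch for ``check_estimator`` (LogisticRegression is just an
# illustrative estimator; any class or instance following the scikit-learn
# API can be passed):
#
#   >>> from sklearn.linear_model import LogisticRegression
#   >>> check_estimator(LogisticRegression)          # class: construction checks too
#   >>> check_estimator(LogisticRegression(C=0.01))  # instance: this configuration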


def _boston_subset(n_samples=200):
    global BOSTON
    if BOSTON is None:
        boston = load_boston()
        X, y = boston.data, boston.target
        X, y = shuffle(X, y, random_state=0)
        X, y = X[:n_samples], y[:n_samples]
        X = StandardScaler().fit_transform(X)
        BOSTON = X, y
    return BOSTON


def set_checking_parameters(estimator):
    # set parameters to speed up some estimators and
    # avoid deprecated behaviour
    params = estimator.get_params()
    if ("n_iter" in params and estimator.__class__.__name__ != "TSNE"
            and not isinstance(estimator, BaseSGD)):
        estimator.set_params(n_iter=5)
    if "max_iter" in params:
        warnings.simplefilter("ignore", ConvergenceWarning)
        if estimator.max_iter is not None:
            estimator.set_params(max_iter=min(5, estimator.max_iter))
        # LinearSVR, LinearSVC
        if estimator.__class__.__name__ in ['LinearSVR', 'LinearSVC']:
            estimator.set_params(max_iter=20)
        # NMF
        if estimator.__class__.__name__ == 'NMF':
            estimator.set_params(max_iter=100)
        # MLP
        if estimator.__class__.__name__ in ['MLPClassifier', 'MLPRegressor']:
            estimator.set_params(max_iter=100)
    if "n_resampling" in params:
        # randomized lasso
        estimator.set_params(n_resampling=5)
    if "n_estimators" in params:
        # especially gradient boosting with default 100
        estimator.set_params(n_estimators=min(5, estimator.n_estimators))
    if "max_trials" in params:
        # RANSAC
        estimator.set_params(max_trials=10)
    if "n_init" in params:
        # K-Means
        estimator.set_params(n_init=2)
    if "decision_function_shape" in params:
        # SVC
        estimator.set_params(decision_function_shape='ovo')

    if estimator.__class__.__name__ == "SelectFdr":
        # be tolerant of noisy datasets (not actually speed)
        estimator.set_params(alpha=.5)

    if estimator.__class__.__name__ == "TheilSenRegressor":
        estimator.max_subpopulation = 100

    if isinstance(estimator, BaseRandomProjection):
        # Due to the Johnson-Lindenstrauss lemma and the often very small
        # number of samples, the number of components of the random matrix
        # projection will probably be greater than the number of features.
        # So we impose a smaller number (and avoid "auto" mode).
        estimator.set_params(n_components=2)

    if isinstance(estimator, SelectKBest):
        # SelectKBest has a default of k=10,
        # which is more features than we have in most cases.
        estimator.set_params(k=1)


class NotAnArray(object):
    """An object that is convertible to an array."""

    def __init__(self, data):
        self.data = data

    def __array__(self, dtype=None):
        return self.data
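

# A quick sketch of why ``NotAnArray`` is useful in these checks: numpy (and
# therefore scikit-learn's input validation) falls back on the ``__array__``
# protocol, so the wrapper behaves like array data without being an ndarray:
#
#   >>> wrapped = NotAnArray(np.arange(6).reshape(3, 2))
#   >>> np.asarray(wrapped).shape
#   (3, 2)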


def _is_32bit():
    """Detect if process is 32bit Python."""
    return struct.calcsize('P') * 8 == 32
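

# ``struct.calcsize('P')`` is the size of a C pointer in bytes, so the check
# above evaluates, for example, as follows on a 64-bit interpreter:
#
#   >>> import struct
#   >>> struct.calcsize('P') * 8
#   64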


def check_estimator_sparse_data(name, estimator_orig):
    rng = np.random.RandomState(0)
    X = rng.rand(40, 10)
    X[X < .8] = 0
    X_csr = sparse.csr_matrix(X)
    y = (4 * rng.rand(40)).astype(np.int)
    # catch deprecation warnings
    with ignore_warnings(category=DeprecationWarning):
        estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)
    for sparse_format in ['csr', 'csc', 'dok', 'lil', 'coo', 'dia', 'bsr']:
        X = X_csr.asformat(sparse_format)
        # catch deprecation warnings
        with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
            if name in ['Scaler', 'StandardScaler']:
                estimator = clone(estimator).set_params(with_mean=False)
            else:
                estimator = clone(estimator)
        # fit and predict
        try:
            with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
                estimator.fit(X, y)
            if hasattr(estimator, "predict"):
                pred = estimator.predict(X)
                assert_equal(pred.shape, (X.shape[0],))
            if hasattr(estimator, 'predict_proba'):
                probs = estimator.predict_proba(X)
                assert_equal(probs.shape, (X.shape[0], 4))
        except TypeError as e:
            if 'sparse' not in repr(e):
                print("Estimator %s doesn't seem to fail gracefully on "
                      "sparse data: the error message should state explicitly "
                      "that sparse input is not supported if this is not the "
                      "case." % name)
                raise
        except Exception:
            print("Estimator %s doesn't seem to fail gracefully on "
                  "sparse data: it should raise a TypeError if sparse input "
                  "is explicitly not supported." % name)
            raise
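

# The loop above exercises every major scipy.sparse layout via ``asformat``;
# a minimal sketch of that conversion on its own:
#
#   >>> from scipy import sparse
#   >>> X_csr = sparse.csr_matrix(np.eye(3))
#   >>> X_coo = X_csr.asformat('coo')
#   >>> X_coo.format
#   'coo'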


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_sample_weights_pandas_series(name, estimator_orig):
    # check that estimators will accept a 'sample_weight' parameter of
    # type pandas.Series in the 'fit' function.
    estimator = clone(estimator_orig)
    if has_fit_parameter(estimator, "sample_weight"):
        try:
            import pandas as pd
            X = pd.DataFrame([[1, 1], [1, 2], [1, 3], [2, 1], [2, 2], [2, 3]])
            y = pd.Series([1, 1, 1, 2, 2, 2])
            weights = pd.Series([1] * 6)
            try:
                estimator.fit(X, y, sample_weight=weights)
            except ValueError:
                raise ValueError("Estimator {0} raises error if "
                                 "'sample_weight' parameter is of "
                                 "type pandas.Series".format(name))
        except ImportError:
            raise SkipTest("pandas is not installed: not testing for "
                           "input of type pandas.Series to sample_weight.")


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_sample_weights_list(name, estimator_orig):
    # check that estimators will accept a 'sample_weight' parameter of
    # type list in the 'fit' function.
    if has_fit_parameter(estimator_orig, "sample_weight"):
        estimator = clone(estimator_orig)
        rnd = np.random.RandomState(0)
        X = rnd.uniform(size=(10, 3))
        y = np.arange(10) % 3
        y = multioutput_estimator_convert_y_2d(estimator, y)
        sample_weight = [3] * 10
        # Test that estimators don't raise any exception
        estimator.fit(X, y, sample_weight=sample_weight)
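

# ``has_fit_parameter`` simply introspects the signature of ``fit``; a small
# sketch of the gating used by the two checks above (SVC is an illustrative
# estimator that accepts sample weights):
#
#   >>> from sklearn.svm import SVC
#   >>> has_fit_parameter(SVC(), "sample_weight")
#   True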


@ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning))
def check_dtype_object(name, estimator_orig):
    # check that estimators treat dtype object as numeric if possible
    rng = np.random.RandomState(0)
    X = rng.rand(40, 10).astype(object)
    y = (X[:, 0] * 4).astype(np.int)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    estimator.fit(X, y)
    if hasattr(estimator, "predict"):
        estimator.predict(X)

    if hasattr(estimator, "transform"):
        estimator.transform(X)

    try:
        estimator.fit(X, y.astype(object))
    except Exception as e:
        if "Unknown label type" not in str(e):
            raise

    X[0, 0] = {'foo': 'bar'}
    msg = "argument must be a string or a number"
    assert_raises_regex(TypeError, msg, estimator.fit, X, y)


@ignore_warnings
def check_dict_unchanged(name, estimator_orig):
    # SpectralCoclustering raises
    # "ValueError: Found array with 0 feature(s) (shape=(23, 0))
    # while a minimum of 1 is required." on this data, so skip it.
    if name in ['SpectralCoclustering']:
        return
    rnd = np.random.RandomState(0)
    if name in ['RANSACRegressor']:
        X = 3 * rnd.uniform(size=(20, 3))
    else:
        X = 2 * rnd.uniform(size=(20, 3))

    y = X[:, 0].astype(np.int)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)
    if hasattr(estimator, "n_components"):
        estimator.n_components = 1

    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    if hasattr(estimator, "n_best"):
        estimator.n_best = 1

    set_random_state(estimator, 1)

    estimator.fit(X, y)
    for method in ["predict", "transform", "decision_function",
                   "predict_proba"]:
        if hasattr(estimator, method):
            dict_before = estimator.__dict__.copy()
            getattr(estimator, method)(X)
            assert_dict_equal(estimator.__dict__, dict_before,
                              'Estimator changes __dict__ during %s' % method)


def is_public_parameter(attr):
    return not (attr.startswith('_') or attr.endswith('_'))
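

# The convention encoded above: attributes with no leading or trailing
# underscore are constructor parameters; a trailing underscore marks a
# fitted attribute and a leading underscore marks a private one.  E.g.:
#
#   >>> [is_public_parameter(a) for a in ['alpha', 'coef_', '_cache']]
#   [True, False, False]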


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_dont_overwrite_parameters(name, estimator_orig):
    # check that fit method only changes or sets private attributes
    if hasattr(estimator_orig.__init__, "deprecated_original"):
        # to not check deprecated classes
        return
    estimator = clone(estimator_orig)
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20, 3))
    y = X[:, 0].astype(np.int)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    dict_before_fit = estimator.__dict__.copy()
    estimator.fit(X, y)

    dict_after_fit = estimator.__dict__

    public_keys_after_fit = [key for key in dict_after_fit.keys()
                             if is_public_parameter(key)]

    attrs_added_by_fit = [key for key in public_keys_after_fit
                          if key not in dict_before_fit.keys()]

    # check that fit doesn't add any public attribute
    assert_true(not attrs_added_by_fit,
                ('Estimator adds public attribute(s) during'
                 ' the fit method.'
                 ' Estimators are only allowed to add private attributes'
                 ' that start with _ or end'
                 ' with _, but %s was added' % ', '.join(attrs_added_by_fit)))

    # check that fit doesn't change any public attribute
    attrs_changed_by_fit = [key for key in public_keys_after_fit
                            if (dict_before_fit[key]
                                is not dict_after_fit[key])]

    assert_true(not attrs_changed_by_fit,
                ('Estimator changes public attribute(s) during'
                 ' the fit method. Estimators are only allowed'
                 ' to change attributes that start'
                 ' or end with _, but'
                 ' %s changed' % ', '.join(attrs_changed_by_fit)))


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_fit2d_predict1d(name, estimator_orig):
    # check by fitting a 2d array and predicting with a 1d array
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20, 3))
    y = X[:, 0].astype(np.int)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    estimator.fit(X, y)

    for method in ["predict", "transform", "decision_function",
                   "predict_proba"]:
        if hasattr(estimator, method):
            assert_raise_message(ValueError, "Reshape your data",
                                 getattr(estimator, method), X[0])


@ignore_warnings
def check_fit2d_1sample(name, estimator_orig):
    # check fitting a 2d array with only one sample
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(1, 10))
    y = X[:, 0].astype(np.int)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    try:
        estimator.fit(X, y)
    except ValueError:
        pass


@ignore_warnings
def check_fit2d_1feature(name, estimator_orig):
    # check fitting a 2d array with only one feature
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(10, 1))
    y = X[:, 0].astype(np.int)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)
    try:
        estimator.fit(X, y)
    except ValueError:
        pass


@ignore_warnings
def check_fit1d_1feature(name, estimator_orig):
    # check fitting a 1d array with 1 feature
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20))
    y = X.astype(np.int)
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)

    try:
        estimator.fit(X, y)
    except ValueError:
        pass


@ignore_warnings
def check_fit1d_1sample(name, estimator_orig):
    # check fitting a 1d array with only 1 sample
    rnd = np.random.RandomState(0)
    X = 3 * rnd.uniform(size=(20))
    y = np.array([1])
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    if hasattr(estimator, "n_components"):
        estimator.n_components = 1
    if hasattr(estimator, "n_clusters"):
        estimator.n_clusters = 1

    set_random_state(estimator, 1)

    try:
        estimator.fit(X, y)
    except ValueError:
        pass


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_transformer_general(name, transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    _check_transformer(name, transformer, X, y)
    _check_transformer(name, transformer, X.tolist(), y.tolist())


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_transformer_data_not_an_array(name, transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    this_X = NotAnArray(X)
    this_y = NotAnArray(np.asarray(y))
    _check_transformer(name, transformer, this_X, this_y)


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_transformers_unfitted(name, transformer):
    X, y = _boston_subset()

    transformer = clone(transformer)
    assert_raises((AttributeError, ValueError), transformer.transform, X)


def _check_transformer(name, transformer_orig, X, y):
    if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only
        # depending on numpy & scipy and/or maybe generate a test dataset
        # that does not cause such unstable behaviors.
        msg = name + ' is non deterministic on 32bit Python'
        raise SkipTest(msg)
    n_samples, n_features = np.asarray(X).shape
    transformer = clone(transformer_orig)
    set_random_state(transformer)

    # fit

    if name in CROSS_DECOMPOSITION:
        y_ = np.c_[y, y]
        y_[::2, 1] *= 2
    else:
        y_ = y

    transformer.fit(X, y_)
    # fit_transform method should work on non fitted estimator
    transformer_clone = clone(transformer)
    X_pred = transformer_clone.fit_transform(X, y=y_)

    if isinstance(X_pred, tuple):
        for x_pred in X_pred:
            assert_equal(x_pred.shape[0], n_samples)
    else:
        # check for consistent n_samples
        assert_equal(X_pred.shape[0], n_samples)

    if hasattr(transformer, 'transform'):
        if name in CROSS_DECOMPOSITION:
            X_pred2 = transformer.transform(X, y_)
            X_pred3 = transformer.fit_transform(X, y=y_)
        else:
            X_pred2 = transformer.transform(X)
            X_pred3 = transformer.fit_transform(X, y=y_)
        if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
            for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3):
                assert_allclose_dense_sparse(
                    x_pred, x_pred2, atol=1e-2,
                    err_msg="fit_transform and transform outcomes "
                            "not consistent in %s"
                    % transformer)
                assert_allclose_dense_sparse(
                    x_pred, x_pred3, atol=1e-2,
                    err_msg="consecutive fit_transform outcomes "
                            "not consistent in %s"
                    % transformer)
        else:
            assert_allclose_dense_sparse(
                X_pred, X_pred2,
                err_msg="fit_transform and transform outcomes "
                        "not consistent in %s"
                % transformer, atol=1e-2)
            assert_allclose_dense_sparse(
                X_pred, X_pred3, atol=1e-2,
                err_msg="consecutive fit_transform outcomes "
                        "not consistent in %s"
                % transformer)
        assert_equal(_num_samples(X_pred2), n_samples)
        assert_equal(_num_samples(X_pred3), n_samples)

        # raises error on malformed input for transform
        if hasattr(X, 'T'):
            # If it's not an array, it does not have a 'T' property
            assert_raises(ValueError, transformer.transform, X.T)


@ignore_warnings
def check_pipeline_consistency(name, estimator_orig):
    if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only
        # depending on numpy & scipy and/or maybe generate a test dataset
        # that does not cause such unstable behaviors.
        msg = name + ' is non deterministic on 32bit Python'
        raise SkipTest(msg)

    # check that make_pipeline(est) gives same score as est
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X -= X.min()
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)

    funcs = ["score", "fit_transform"]

    for func_name in funcs:
        func = getattr(estimator, func_name, None)
        if func is not None:
            func_pipeline = getattr(pipeline, func_name)
            result = func(X, y)
            result_pipe = func_pipeline(X, y)
            assert_allclose_dense_sparse(result, result_pipe)
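

# A sketch of the equivalence asserted above, outside the check harness
# (KMeans is only an illustrative estimator):
#
#   >>> from sklearn.cluster import KMeans
#   >>> X, y = make_blobs(n_samples=30, random_state=0)
#   >>> est = KMeans(n_clusters=2, random_state=0)
#   >>> pipe = make_pipeline(KMeans(n_clusters=2, random_state=0))
#   >>> np.allclose(est.fit(X, y).score(X, y), pipe.fit(X, y).score(X, y))
#   True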


@ignore_warnings
def check_fit_score_takes_y(name, estimator_orig):
    # check that all estimators accept an optional y
    # in fit and score so they can be used in pipelines
    rnd = np.random.RandomState(0)
    X = rnd.uniform(size=(10, 3))
    y = np.arange(10) % 3
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)
    set_random_state(estimator)

    funcs = ["fit", "score", "partial_fit", "fit_predict", "fit_transform"]
    for func_name in funcs:
        func = getattr(estimator, func_name, None)
        if func is not None:
            func(X, y)
            args = [p.name for p in signature(func).parameters.values()]
            if args[0] == "self":
                # if_delegate_has_method makes methods into functions
                # with an explicit "self", so need to shift arguments
                args = args[1:]
            assert_true(args[1] in ["y", "Y"],
                        "Expected y or Y as second argument for method "
                        "%s of %s. Got arguments: %r."
                        % (func_name, type(estimator).__name__, args))
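

# The parameter-name introspection above relies on ``signature``; a minimal
# sketch of what it sees for a typical ``fit`` (Ridge is only illustrative):
#
#   >>> from sklearn.linear_model import Ridge
#   >>> [p.name for p in signature(Ridge().fit).parameters.values()]
#   ['X', 'y', 'sample_weight']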


@ignore_warnings
def check_estimators_dtypes(name, estimator_orig):
    rnd = np.random.RandomState(0)
    X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32)
    X_train_64 = X_train_32.astype(np.float64)
    X_train_int_64 = X_train_32.astype(np.int64)
    X_train_int_32 = X_train_32.astype(np.int32)
    y = X_train_int_64[:, 0]
    y = multioutput_estimator_convert_y_2d(estimator_orig, y)

    methods = ["predict", "transform", "decision_function", "predict_proba"]

    for X_train in [X_train_32, X_train_64, X_train_int_64, X_train_int_32]:
        estimator = clone(estimator_orig)
        set_random_state(estimator, 1)
        estimator.fit(X_train, y)

        for method in methods:
            if hasattr(estimator, method):
                getattr(estimator, method)(X_train)


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_estimators_empty_data_messages(name, estimator_orig):
    e = clone(estimator_orig)
    set_random_state(e, 1)

    X_zero_samples = np.empty(0).reshape(0, 3)
    # The precise message can change depending on whether X or y is
    # validated first. Let us test the type of exception only:
    assert_raises(ValueError, e.fit, X_zero_samples, [])

    X_zero_features = np.empty(0).reshape(3, 0)
    # the following y should be accepted by both classifiers and regressors
    # and ignored by unsupervised models
    y = multioutput_estimator_convert_y_2d(e, np.array([1, 0, 1]))
    msg = (r"0 feature\(s\) \(shape=\(3, 0\)\) while a minimum of \d* "
           r"is required.")
    assert_raises_regex(ValueError, msg, e.fit, X_zero_features, y)


@ignore_warnings(category=DeprecationWarning)
def check_estimators_nan_inf(name, estimator_orig):
    # Checks that the estimator rejects input X containing NaN or inf.
    rnd = np.random.RandomState(0)
    X_train_finite = rnd.uniform(size=(10, 3))
    X_train_nan = rnd.uniform(size=(10, 3))
    X_train_nan[0, 0] = np.nan
    X_train_inf = rnd.uniform(size=(10, 3))
    X_train_inf[0, 0] = np.inf
    y = np.ones(10)
    y[:5] = 0
    y = multioutput_estimator_convert_y_2d(estimator_orig, y)
    error_string_fit = "Estimator doesn't check for NaN and inf in fit."
    error_string_predict = ("Estimator doesn't check for NaN and inf in"
                            " predict.")
    error_string_transform = ("Estimator doesn't check for NaN and inf in"
                              " transform.")
    for X_train in [X_train_nan, X_train_inf]:
        # catch deprecation warnings
        with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
            estimator = clone(estimator_orig)
            set_random_state(estimator, 1)
            # try to fit
            try:
                estimator.fit(X_train, y)
            except ValueError as e:
                if 'inf' not in repr(e) and 'NaN' not in repr(e):
                    print(error_string_fit, estimator, e)
                    traceback.print_exc(file=sys.stdout)
                    raise e
            except Exception as exc:
                print(error_string_fit, estimator, exc)
                traceback.print_exc(file=sys.stdout)
                raise exc
            else:
                raise AssertionError(error_string_fit, estimator)
            # actually fit
            estimator.fit(X_train_finite, y)

            # predict
            if hasattr(estimator, "predict"):
                try:
                    estimator.predict(X_train)
                except ValueError as e:
                    if 'inf' not in repr(e) and 'NaN' not in repr(e):
                        print(error_string_predict, estimator, e)
                        traceback.print_exc(file=sys.stdout)
                        raise e
                except Exception as exc:
                    print(error_string_predict, estimator, exc)
                    traceback.print_exc(file=sys.stdout)
                else:
                    raise AssertionError(error_string_predict, estimator)

            # transform
            if hasattr(estimator, "transform"):
                try:
                    estimator.transform(X_train)
                except ValueError as e:
                    if 'inf' not in repr(e) and 'NaN' not in repr(e):
                        print(error_string_transform, estimator, e)
                        traceback.print_exc(file=sys.stdout)
                        raise e
                except Exception as exc:
                    print(error_string_transform, estimator, exc)
                    traceback.print_exc(file=sys.stdout)
                else:
                    raise AssertionError(error_string_transform, estimator)


@ignore_warnings
def check_estimators_pickle(name, estimator_orig):
    """Test that we can pickle all estimators"""
    check_methods = ["predict", "transform", "decision_function",
                     "predict_proba"]

    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)

    # some estimators can't do features less than 0
    X -= X.min()

    estimator = clone(estimator_orig)

    # some estimators only take multioutputs
    y = multioutput_estimator_convert_y_2d(estimator, y)

    set_random_state(estimator)
    estimator.fit(X, y)

    result = dict()
    for method in check_methods:
        if hasattr(estimator, method):
            result[method] = getattr(estimator, method)(X)

    # pickle and unpickle!
    pickled_estimator = pickle.dumps(estimator)
    if estimator.__module__.startswith('sklearn.'):
        assert_true(b"version" in pickled_estimator)
    unpickled_estimator = pickle.loads(pickled_estimator)

    for method in result:
        unpickled_result = getattr(unpickled_estimator, method)(X)
        assert_allclose_dense_sparse(result[method], unpickled_result)
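

# A sketch of the pickling round-trip checked above, runnable on its own
# (Ridge is only an illustrative estimator):
#
#   >>> from sklearn.linear_model import Ridge
#   >>> X, y = make_blobs(n_samples=30, random_state=0)
#   >>> est = Ridge().fit(X, y)
#   >>> clone_est = pickle.loads(pickle.dumps(est))
#   >>> np.allclose(est.predict(X), clone_est.predict(X))
#   True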


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_estimators_partial_fit_n_features(name, estimator_orig):
    # check that the estimator complains if the number of features
    # changes between calls to partial_fit.
    if not hasattr(estimator_orig, 'partial_fit'):
        return
    estimator = clone(estimator_orig)
    X, y = make_blobs(n_samples=50, random_state=1)
    X -= X.min()

    try:
        if is_classifier(estimator):
            classes = np.unique(y)
            estimator.partial_fit(X, y, classes=classes)
        else:
            estimator.partial_fit(X, y)
    except NotImplementedError:
        return

    assert_raises(ValueError, estimator.partial_fit, X[:, :-1], y)


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_clustering(name, clusterer_orig):
    clusterer = clone(clusterer_orig)
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    n_samples, n_features = X.shape
    # catch deprecation and neighbors warnings
    if hasattr(clusterer, "n_clusters"):
        clusterer.set_params(n_clusters=3)
    set_random_state(clusterer)
    if name == 'AffinityPropagation':
        clusterer.set_params(preference=-100)
        clusterer.set_params(max_iter=100)

    # fit
    clusterer.fit(X)
    # with lists
    clusterer.fit(X.tolist())

    assert_equal(clusterer.labels_.shape, (n_samples,))
    pred = clusterer.labels_
    assert_greater(adjusted_rand_score(pred, y), 0.4)
    # fit another time with ``fit_predict`` and compare results
    if name == 'SpectralClustering':
        # there is no way to make Spectral clustering deterministic :(
        return
    set_random_state(clusterer)
    with warnings.catch_warnings(record=True):
        pred2 = clusterer.fit_predict(X)
    assert_array_equal(pred, pred2)


@ignore_warnings(category=DeprecationWarning)
def check_clusterer_compute_labels_predict(name, clusterer_orig):
    """Check that predict is invariant of compute_labels"""
    X, y = make_blobs(n_samples=20, random_state=0)
    clusterer = clone(clusterer_orig)

    if hasattr(clusterer, "compute_labels"):
        # MiniBatchKMeans
        if hasattr(clusterer, "random_state"):
            clusterer.set_params(random_state=0)

        X_pred1 = clusterer.fit(X).predict(X)
        clusterer.set_params(compute_labels=False)
        X_pred2 = clusterer.fit(X).predict(X)
        assert_array_equal(X_pred1, X_pred2)


@ignore_warnings(category=DeprecationWarning)
def check_classifiers_one_label(name, classifier_orig):
    error_string_fit = "Classifier can't train when only one class is present."
    error_string_predict = ("Classifier can't predict when only one class is "
                            "present.")
    rnd = np.random.RandomState(0)
    X_train = rnd.uniform(size=(10, 3))
    X_test = rnd.uniform(size=(10, 3))
    y = np.ones(10)
    # catch deprecation warnings
    with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
        classifier = clone(classifier_orig)
        # try to fit
        try:
            classifier.fit(X_train, y)
        except ValueError as e:
            if 'class' not in repr(e):
                print(error_string_fit, classifier, e)
                traceback.print_exc(file=sys.stdout)
                raise e
            else:
                return
        except Exception as exc:
            print(error_string_fit, classifier, exc)
            traceback.print_exc(file=sys.stdout)
            raise exc
        # predict
        try:
            assert_array_equal(classifier.predict(X_test), y)
        except Exception as exc:
            print(error_string_predict, classifier, exc)
            raise exc


@ignore_warnings  # Warnings are raised by decision function
def check_classifiers_train(name, classifier_orig):
    X_m, y_m = make_blobs(n_samples=300, random_state=0)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, n_features = X.shape
        classifier = clone(classifier_orig)
        if name in ['BernoulliNB', 'MultinomialNB']:
            X -= X.min()
        set_random_state(classifier)
        # raises error on malformed input for fit
        assert_raises(ValueError, classifier.fit, X, y[:-1])

        # fit
        classifier.fit(X, y)
        # with lists
        classifier.fit(X.tolist(), y.tolist())
        assert_true(hasattr(classifier, "classes_"))
        y_pred = classifier.predict(X)
        assert_equal(y_pred.shape, (n_samples,))
        # training set performance
        if name not in ['BernoulliNB', 'MultinomialNB']:
            assert_greater(accuracy_score(y, y_pred), 0.83)

        # raises error on malformed input for predict
        assert_raises(ValueError, classifier.predict, X.T)
        if hasattr(classifier, "decision_function"):
            try:
                # decision_function agrees with predict
                decision = classifier.decision_function(X)
                if n_classes == 2:
                    assert_equal(decision.shape, (n_samples,))
                    dec_pred = (decision.ravel() > 0).astype(np.int)
                    assert_array_equal(dec_pred, y_pred)
                if (n_classes == 3 and
                        # 1on1 of LibSVM works differently
                        not isinstance(classifier, BaseLibSVM)):
                    assert_equal(decision.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)

                # raises error on malformed input for decision_function
                assert_raises(ValueError,
                              classifier.decision_function, X.T)
            except NotImplementedError:
                pass
        if hasattr(classifier, "predict_proba"):
            # predict_proba agrees with predict
            y_prob = classifier.predict_proba(X)
            assert_equal(y_prob.shape, (n_samples, n_classes))
            assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
            # check that probas for all classes sum to one
            assert_allclose(np.sum(y_prob, axis=1), np.ones(n_samples))
            # raises error on malformed input for predict_proba
            assert_raises(ValueError, classifier.predict_proba, X.T)
            if hasattr(classifier, "predict_log_proba"):
                # predict_log_proba is a transformation of predict_proba
                y_log_prob = classifier.predict_log_proba(X)
                assert_allclose(y_log_prob, np.log(y_prob), 8, atol=1e-9)
                assert_array_equal(np.argsort(y_log_prob), np.argsort(y_prob))
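

# The agreement between ``predict_proba`` and ``predict`` asserted above
# boils down to an argmax over classes; a small sketch (LogisticRegression
# is only an illustrative classifier, and the identity holds here because
# classes_ is exactly [0, 1, 2]):
#
#   >>> from sklearn.linear_model import LogisticRegression
#   >>> X, y = make_blobs(n_samples=30, centers=3, random_state=0)
#   >>> clf = LogisticRegression().fit(X, y)
#   >>> np.array_equal(np.argmax(clf.predict_proba(X), axis=1),
#   ...                clf.predict(X))
#   True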


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_estimators_fit_returns_self(name, estimator_orig):
    """Check if self is returned when calling fit"""
    X, y = make_blobs(random_state=0, n_samples=9, n_features=4)
    # some want non-negative input
    X -= X.min()

    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    set_random_state(estimator)

    assert_true(estimator.fit(X, y) is estimator)


@ignore_warnings
def check_estimators_unfitted(name, estimator_orig):
    """Check that predict raises an exception in an unfitted estimator.

    Unfitted estimators should raise either AttributeError or ValueError.
    The specific exception type NotFittedError inherits from both and can
    therefore be adequately raised for that purpose.
    """

    # Common test for Regressors as well as Classifiers
    X, y = _boston_subset()

    est = clone(estimator_orig)

    msg = "fit"
    if hasattr(est, 'predict'):
        assert_raise_message((AttributeError, ValueError), msg,
                             est.predict, X)

    if hasattr(est, 'decision_function'):
        assert_raise_message((AttributeError, ValueError), msg,
                             est.decision_function, X)

    if hasattr(est, 'predict_proba'):
        assert_raise_message((AttributeError, ValueError), msg,
                             est.predict_proba, X)

    if hasattr(est, 'predict_log_proba'):
        assert_raise_message((AttributeError, ValueError), msg,
                             est.predict_log_proba, X)


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_supervised_y_2d(name, estimator_orig):
    if "MultiTask" in name:
        # These only work on 2d, so this test makes no sense
        return
    if name == "GaussianProcess":
        # Workaround: https://github.com/scikit-learn/scikit-learn/issues/10562
        return
    rnd = np.random.RandomState(0)
    X = rnd.uniform(size=(10, 3))
    y = np.arange(10) % 3
    estimator = clone(estimator_orig)
    set_random_state(estimator)
    # fit
    estimator.fit(X, y)
    y_pred = estimator.predict(X)

    set_random_state(estimator)
    # Check that when a 2D y is given, a DataConversionWarning is
    # raised
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DataConversionWarning)
        warnings.simplefilter("ignore", RuntimeWarning)
        estimator.fit(X, y[:, np.newaxis])
    y_pred_2d = estimator.predict(X)
    msg = "expected 1 DataConversionWarning, got: %s" % (
        ", ".join([str(w_x) for w_x in w]))
    if name not in MULTI_OUTPUT:
        # check that we warned if we don't support multi-output
        assert_greater(len(w), 0, msg)
        assert_true("DataConversionWarning('A column-vector y"
                    " was passed when a 1d array was expected" in msg)
        assert_allclose(y_pred.ravel(), y_pred_2d.ravel())


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_classifiers_classes(name, classifier_orig):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]

    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names

        classes = np.unique(y_)
        classifier = clone(classifier_orig)
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_random_state(classifier)
        # fit
        classifier.fit(X, y_)

        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_regressors_int(name, regressor_orig):
    X, _ = _boston_subset()
    X = X[:50]
    rnd = np.random.RandomState(0)
    y = rnd.randint(3, size=X.shape[0])
    y = multioutput_estimator_convert_y_2d(regressor_orig, y)
    rnd = np.random.RandomState(0)
    # separate estimators to control random seeds
    regressor_1 = clone(regressor_orig)
    regressor_2 = clone(regressor_orig)
    set_random_state(regressor_1)
    set_random_state(regressor_2)

    if name in CROSS_DECOMPOSITION:
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    # fit
    regressor_1.fit(X, y_)
    pred1 = regressor_1.predict(X)
    regressor_2.fit(X, y_.astype(np.float))
    pred2 = regressor_2.predict(X)
    assert_allclose(pred1, pred2, atol=1e-2, err_msg=name)


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_regressors_train(name, regressor_orig):
    X, y = _boston_subset()
    y = StandardScaler().fit_transform(y.reshape(-1, 1))  # X is already scaled
    y = y.ravel()
    regressor = clone(regressor_orig)
    y = multioutput_estimator_convert_y_2d(regressor, y)
    rnd = np.random.RandomState(0)
    if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'):
        # linear regressors need to set alpha, but not generalized CV ones
        regressor.alpha = 0.01
    if name == 'PassiveAggressiveRegressor':
        regressor.C = 0.01

    # raises error on malformed input for fit
    assert_raises(ValueError, regressor.fit, X, y[:-1])
    # fit
    if name in CROSS_DECOMPOSITION:
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y
    set_random_state(regressor)
    regressor.fit(X, y_)
    regressor.fit(X.tolist(), y_.tolist())
    y_pred = regressor.predict(X)
    assert_equal(y_pred.shape, y_.shape)

    # TODO: find out why PLS and CCA fail. RANSAC is random
    # and furthermore assumes the presence of outliers, hence
    # skipped
    if name not in ('PLSCanonical', 'CCA', 'RANSACRegressor'):
        assert_greater(regressor.score(X, y_), 0.5)


@ignore_warnings
def check_regressors_no_decision_function(name, regressor_orig):
    # check that regressors don't silently expose decision_function or
    # predict_proba; if present, calling them must raise a DeprecationWarning
    rng = np.random.RandomState(0)
    X = rng.normal(size=(10, 4))
    regressor = clone(regressor_orig)
    y = multioutput_estimator_convert_y_2d(regressor, X[:, 0])

    if hasattr(regressor, "n_components"):
        # FIXME CCA, PLS is not robust to rank 1 effects
        regressor.n_components = 1

    regressor.fit(X, y)
    funcs = ["decision_function", "predict_proba", "predict_log_proba"]
    for func_name in funcs:
        func = getattr(regressor, func_name, None)
        if func is None:
            # doesn't have function
            continue
        # has function. Should raise deprecation warning
        msg = func_name
        assert_warns_message(DeprecationWarning, msg, func, X)


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_class_weight_classifiers(name, classifier_orig):
    if name == "NuSVC":
        # the sparse version has a parameter that doesn't do anything
        raise SkipTest
    if name.endswith("NB"):
        # NaiveBayes classifiers have a somewhat different interface.
        # FIXME SOON!
        raise SkipTest

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                            random_state=0)
        n_centers = len(np.unique(y_train))

        if n_centers == 2:
            class_weight = {0: 1000, 1: 0.0001}
        else:
            class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

        classifier = clone(classifier_orig).set_params(
            class_weight=class_weight)
        if hasattr(classifier, "n_iter"):
            classifier.set_params(n_iter=100)
        if hasattr(classifier, "max_iter"):
            classifier.set_params(max_iter=1000)
        if hasattr(classifier, "min_weight_fraction_leaf"):
            classifier.set_params(min_weight_fraction_leaf=0.01)

        set_random_state(classifier)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        # XXX: Generally can use 0.89 here. On Windows, LinearSVC gets
        # 0.88 (Issue #9111)
        assert_greater(np.mean(y_pred == 0), 0.87)


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_class_weight_balanced_classifiers(name, classifier_orig, X_train,
                                            y_train, X_test, y_test, weights):
    classifier = clone(classifier_orig)
    if hasattr(classifier, "n_iter"):
        classifier.set_params(n_iter=100)
    if hasattr(classifier, "max_iter"):
        classifier.set_params(max_iter=1000)

    set_random_state(classifier)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    classifier.set_params(class_weight='balanced')
    classifier.fit(X_train, y_train)
    y_pred_balanced = classifier.predict(X_test)
    assert_greater(f1_score(y_test, y_pred_balanced, average='weighted'),
                   f1_score(y_test, y_pred, average='weighted'))


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_class_weight_balanced_linear_classifier(name, Classifier):
    """Test class weights with non-contiguous class labels."""
    # this is run on classes, not instances, though this should be changed
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                  [1.0, 1.0], [1.0, 0.0]])
    y = np.array([1, 1, 1, -1, -1])

    classifier = Classifier()

    if hasattr(classifier, "n_iter"):
        # This is a very small dataset, default n_iter are likely to prevent
        # convergence
        classifier.set_params(n_iter=1000)
    if hasattr(classifier, "max_iter"):
        classifier.set_params(max_iter=1000)
    set_random_state(classifier)

    # Let the model compute the class frequencies
    classifier.set_params(class_weight='balanced')
    coef_balanced = classifier.fit(X, y).coef_.copy()

    # Count each label occurrence to reweight manually
    n_samples = len(y)
    n_classes = float(len(np.unique(y)))

    class_weight = {1: n_samples / (np.sum(y == 1) * n_classes),
                    -1: n_samples / (np.sum(y == -1) * n_classes)}
    classifier.set_params(class_weight=class_weight)
    coef_manual = classifier.fit(X, y).coef_.copy()

    assert_allclose(coef_balanced, coef_manual)
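

# Worked example of the 'balanced' reweighting verified above: with
# y = [1, 1, 1, -1, -1], n_samples = 5 and n_classes = 2, so class 1 gets
# 5 / (3 * 2) ~= 0.833 and class -1 gets 5 / (2 * 2) = 1.25, i.e. the
# under-represented class is weighted up:
#
#   >>> y = np.array([1, 1, 1, -1, -1])
#   >>> {c: len(y) / (np.sum(y == c) * 2.0) for c in (1, -1)}
#   {1: 0.8333333333333334, -1: 1.25}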
@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_estimators_overwrite_params(name, estimator_orig):
    X, y = make_blobs(random_state=0, n_samples=9)
    # some want non-negative input
    X -= X.min()
    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    set_random_state(estimator)

    # Make a physical copy of the original estimator parameters before fitting.
    params = estimator.get_params()
    original_params = deepcopy(params)

    # Fit the model
    estimator.fit(X, y)

    # Compare the state of the model parameters with the original parameters
    new_params = estimator.get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is a possible RandomState instance, but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert_equal(hash(new_value), hash(original_value),
                     "Estimator %s should not change or mutate "
                     "the parameter %s from %s to %s during fit."
                     % (name, param_name, original_value, new_value))


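# A minimal illustrative sketch (ours, not part of the original check suite):
# joblib.hash (imported at the top of this module as `hash`) recursively
# checksums sub-objects, so it catches in-place mutations of array-valued
# parameters that identity or shallow equality comparisons would miss.
def _demo_hash_detects_mutation():
    param = np.array([1.0, 2.0, 3.0])
    checksum_before = hash(param)  # joblib.hash, not the builtin
    param[0] = -1.0                # in-place mutation, as a bad fit() might do
    return checksum_before != hash(param)  # True: the mutation is detected

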
@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_no_fit_attributes_set_in_init(name, Estimator):
    """Check that Estimator.__init__ doesn't set trailing-_ attributes."""
    # this check works on classes, not instances
    estimator = Estimator()
    for attr in dir(estimator):
        if attr.endswith("_") and not attr.startswith("__"):
            # This check also covers properties: they can be listed in
            # dir() while hasattr() returns False, as long as the
            # property getter raises an AttributeError.
            assert_false(
                hasattr(estimator, attr),
                "By convention, attributes ending with '_' are "
                'estimated from data in scikit-learn. Consequently they '
                'should not be initialized in the constructor of an '
                'estimator but in the fit method. Attribute {!r} '
                'was found in estimator {}'.format(attr, name))


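# A minimal illustrative sketch (ours, not part of the original check suite):
# a property can appear in dir() while hasattr() is False, because hasattr
# swallows the AttributeError raised by the getter. This is why the check
# above must combine dir() with hasattr().
class _DemoLazyEstimator(object):
    @property
    def coef_(self):
        # mimics an estimator attribute that only exists after fit()
        raise AttributeError("not fitted")
# 'coef_' in dir(_DemoLazyEstimator())    -> True
# hasattr(_DemoLazyEstimator(), 'coef_')  -> False

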
@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_sparsify_coefficients(name, estimator_orig):
    X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1],
                  [-1, -2], [2, 2], [-2, -2]])
    y = [1, 1, 1, 2, 2, 2, 3, 3, 3]
    est = clone(estimator_orig)

    est.fit(X, y)
    pred_orig = est.predict(X)

    # test sparsify with dense inputs
    est.sparsify()
    assert_true(sparse.issparse(est.coef_))
    pred = est.predict(X)
    assert_array_equal(pred, pred_orig)

    # pickle and unpickle with sparse coef_
    est = pickle.loads(pickle.dumps(est))
    assert_true(sparse.issparse(est.coef_))
    pred = est.predict(X)
    assert_array_equal(pred, pred_orig)


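# Hypothetical usage sketch (ours): sparsify() converts coef_ to a
# scipy.sparse matrix in place, which saves memory when most coefficients
# are zero (e.g. after strong L1 regularization) while predictions stay
# unchanged:
#
#     from sklearn.linear_model import SGDClassifier
#     clf = SGDClassifier(penalty='l1').fit(X, y)
#     clf.sparsify()   # coef_ becomes a scipy.sparse matrix
#     clf.predict(X)   # same predictions as before

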
@ignore_warnings(category=DeprecationWarning)
def check_classifier_data_not_an_array(name, estimator_orig):
    X = np.array([[3, 0], [0, 1], [0, 2], [1, 1], [1, 2], [2, 1]])
    y = [1, 1, 1, 2, 2, 2]
    y = multioutput_estimator_convert_y_2d(estimator_orig, y)
    check_estimators_data_not_an_array(name, estimator_orig, X, y)


@ignore_warnings(category=DeprecationWarning)
def check_regressor_data_not_an_array(name, estimator_orig):
    X, y = _boston_subset(n_samples=50)
    y = multioutput_estimator_convert_y_2d(estimator_orig, y)
    check_estimators_data_not_an_array(name, estimator_orig, X, y)


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_estimators_data_not_an_array(name, estimator_orig, X, y):
    if name in CROSS_DECOMPOSITION:
        raise SkipTest
    # separate estimators to control random seeds
    estimator_1 = clone(estimator_orig)
    estimator_2 = clone(estimator_orig)
    set_random_state(estimator_1)
    set_random_state(estimator_2)

    y_ = NotAnArray(np.asarray(y))
    X_ = NotAnArray(np.asarray(X))

    # fit
    estimator_1.fit(X_, y_)
    pred1 = estimator_1.predict(X_)
    estimator_2.fit(X, y)
    pred2 = estimator_2.predict(X)
    assert_allclose(pred1, pred2, atol=1e-2, err_msg=name)


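# A minimal illustrative sketch (ours, not part of the original check suite):
# the NotAnArray wrapper used above works through numpy's __array__ protocol,
# which lets np.asarray() pull the data out of an arbitrary object, so input
# validation still succeeds on non-ndarray array-likes.
class _DemoArrayLike(object):
    def __init__(self, data):
        self._data = np.asarray(data)

    def __array__(self, dtype=None):
        # called by np.asarray() / np.array() during input validation
        return self._data
# np.asarray(_DemoArrayLike([[1., 2.]])).shape == (1, 2)

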
def check_parameters_default_constructible(name, Estimator):
    # this check works on classes, not instances
    classifier = LinearDiscriminantAnalysis()
    # test default-constructibility
    # get rid of deprecation warnings
    with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
        if name in META_ESTIMATORS:
            estimator = Estimator(classifier)
        else:
            estimator = Estimator()
        # test cloning
        clone(estimator)
        # test __repr__
        repr(estimator)
        # test that set_params returns self
        assert_true(estimator.set_params() is estimator)

        # test if init does nothing but set parameters
        # this is important for grid_search etc.
        # We get the default parameters from init and then
        # compare these against the actual values of the attributes.

        # Unwrap any deprecation decorator to get the original __init__.
        init = getattr(estimator.__init__, 'deprecated_original',
                       estimator.__init__)

        try:
            def param_filter(p):
                """Identify hyperparameters of an estimator"""
                return (p.name != 'self' and
                        p.kind != p.VAR_KEYWORD and
                        p.kind != p.VAR_POSITIONAL)

            init_params = [p for p in signature(init).parameters.values()
                           if param_filter(p)]
        except (TypeError, ValueError):
            # init is not a python function.
            # true for mixins
            return
        params = estimator.get_params()
        if name in META_ESTIMATORS:
            # they can need a non-default argument
            init_params = init_params[1:]

        for init_param in init_params:
            assert_not_equal(init_param.default, init_param.empty,
                             "parameter %s for %s has no default value"
                             % (init_param.name, type(estimator).__name__))
            assert_in(type(init_param.default),
                      [str, int, float, bool, tuple, type(None),
                       np.float64, types.FunctionType, Memory])
            if init_param.name not in params.keys():
                # deprecated parameter, not in get_params
                assert_true(init_param.default is None)
                continue

            if (issubclass(Estimator, BaseSGD) and
                    init_param.name in ['tol', 'max_iter']):
                # To remove in 0.21, when they get their future default values
                continue

            param_value = params[init_param.name]
            if isinstance(param_value, np.ndarray):
                assert_array_equal(param_value, init_param.default)
            else:
                assert_equal(param_value, init_param.default, init_param.name)


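# A minimal illustrative sketch (ours, not part of the original check suite):
# the same signature introspection can list any estimator's constructor
# defaults, which is what get_params() is compared against above. The helper
# name is hypothetical.
def _demo_init_defaults(cls):
    init = getattr(cls.__init__, 'deprecated_original', cls.__init__)
    return dict((p.name, p.default)
                for p in signature(init).parameters.values()
                if p.name != 'self' and
                p.kind not in (p.VAR_KEYWORD, p.VAR_POSITIONAL))
# e.g. _demo_init_defaults(LinearDiscriminantAnalysis)['solver'] == 'svd'

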
def multioutput_estimator_convert_y_2d(estimator, y):
    # Estimators in mono_output_task_error raise ValueError if y is 1-D.
    # Convert y into a 2-D array for those estimators.
    if "MultiTask" in estimator.__class__.__name__:
        return np.reshape(y, (-1, 1))
    return y


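# A minimal illustrative note (ours): np.reshape(y, (-1, 1)) turns a 1-D
# target of shape (n_samples,) into a single-column 2-D array of shape
# (n_samples, 1), the layout the MultiTask* estimators expect:
#
#     np.reshape(np.array([1, 2, 3]), (-1, 1)).shape == (3, 1)

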
@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_non_transformer_estimators_n_iter(name, estimator_orig):
    # Test that non-transformer estimators with a max_iter parameter
    # report an n_iter_ attribute of at least 1 after fitting.

    # These models are dependent on external solvers like
    # libsvm and accessing the iter parameter is non-trivial.
    not_run_check_n_iter = ['Ridge', 'SVR', 'NuSVR', 'NuSVC',
                            'RidgeClassifier', 'SVC', 'RandomizedLasso',
                            'LogisticRegressionCV', 'LinearSVC',
                            'LogisticRegression']

    # Tested in test_transformer_n_iter
    not_run_check_n_iter += CROSS_DECOMPOSITION
    if name in not_run_check_n_iter:
        return

    # LassoLars stops early for the default alpha=1.0 on the iris dataset.
    if name == 'LassoLars':
        estimator = clone(estimator_orig).set_params(alpha=0.)
    else:
        estimator = clone(estimator_orig)
    if hasattr(estimator, 'max_iter'):
        iris = load_iris()
        X, y_ = iris.data, iris.target
        y_ = multioutput_estimator_convert_y_2d(estimator, y_)

        set_random_state(estimator, 0)
        if name == 'AffinityPropagation':
            estimator.fit(X)
        else:
            estimator.fit(X, y_)

        # HuberRegressor depends on scipy.optimize.fmin_l_bfgs_b,
        # which doesn't return an n_iter for old versions of SciPy.
        if not (name == 'HuberRegressor' and estimator.n_iter_ is None):
            assert_greater_equal(estimator.n_iter_, 1)


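# Hypothetical usage sketch (ours): the same n_iter_ contract can be
# spot-checked directly on any iterative estimator, for example:
#
#     from sklearn.linear_model import Perceptron
#     iris = load_iris()
#     est = Perceptron(max_iter=10).fit(iris.data, iris.target)
#     assert est.n_iter_ >= 1

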
@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_transformer_n_iter(name, estimator_orig):
    # Test that transformers with a max_iter parameter report an
    # n_iter_ attribute of at least 1 after fitting.
    estimator = clone(estimator_orig)
    if hasattr(estimator, "max_iter"):
        if name in CROSS_DECOMPOSITION:
            # Check using default data
            X = [[0., 0., 1.], [1., 0., 0.], [2., 2., 2.], [2., 5., 4.]]
            y_ = [[0.1, -0.2], [0.9, 1.1], [0.1, -0.5], [0.3, -0.2]]

        else:
            X, y_ = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                               random_state=0, n_features=2, cluster_std=0.1)
            X -= X.min() - 0.1
        set_random_state(estimator, 0)
        estimator.fit(X, y_)

        # These return an n_iter_ per component.
        if name in CROSS_DECOMPOSITION:
            for iter_ in estimator.n_iter_:
                assert_greater_equal(iter_, 1)
        else:
            assert_greater_equal(estimator.n_iter_, 1)


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_get_params_invariance(name, estimator_orig):
    # Checks if get_params(deep=False) is a subset of get_params(deep=True)
    class T(BaseEstimator):
        """Mock classifier."""

        def __init__(self):
            pass

        def fit(self, X, y):
            return self

        def transform(self, X):
            return X

    e = clone(estimator_orig)

    shallow_params = e.get_params(deep=False)
    deep_params = e.get_params(deep=True)

    assert_true(all(item in deep_params.items() for item in
                    shallow_params.items()))


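# A minimal illustrative sketch (ours, not part of the original check suite):
# for a composite estimator, deep=True adds nested 'step__param' entries on
# top of the shallow ones, so the subset relation asserted above holds:
#
#     pipe = make_pipeline(StandardScaler())
#     shallow = pipe.get_params(deep=False)  # e.g. 'steps', 'memory'
#     deep = pipe.get_params(deep=True)      # plus 'standardscaler__with_mean', ...
#     assert all(item in deep.items() for item in shallow.items())

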
@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_classifiers_regression_target(name, estimator_orig):
    # Check if classifier throws an exception when fed regression targets

    boston = load_boston()
    X, y = boston.data, boston.target
    e = clone(estimator_orig)
    msg = 'Unknown label type: '
    assert_raises_regex(ValueError, msg, e.fit, X, y)


@ignore_warnings(category=(DeprecationWarning, FutureWarning))
def check_decision_proba_consistency(name, estimator_orig):
    # Check whether an estimator having both decision_function and
    # predict_proba methods has outputs with perfect rank correlation.

    centers = [(2, 2), (4, 4)]
    # centers is a list of 2-D points, so make_blobs derives 2 features
    # from it and the n_features argument below is effectively ignored.
    X, y = make_blobs(n_samples=100, random_state=0, n_features=4,
                      centers=centers, cluster_std=1.0, shuffle=True)
    X_test = np.random.randn(20, 2) + 4
    estimator = clone(estimator_orig)

    if (hasattr(estimator, "decision_function") and
            hasattr(estimator, "predict_proba")):

        estimator.fit(X, y)
        a = estimator.predict_proba(X_test)[:, 1]
        b = estimator.decision_function(X_test)
        assert_array_equal(rankdata(a), rankdata(b))
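

# A minimal illustrative note (ours): rankdata (from scipy.stats, imported at
# the top of this module) maps values to their ranks, so equal rank vectors
# mean the two outputs order the samples identically even though their
# scales differ:
#
#     rankdata([0.1, 0.7, 0.4])   # array([1., 3., 2.])
#     rankdata([-2.0, 3.5, 0.2])  # array([1., 3., 2.])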