""" Testing Recursive feature elimination """ import numpy as np from numpy.testing import assert_array_almost_equal, assert_array_equal from scipy import sparse from sklearn.feature_selection.rfe import RFE, RFECV from sklearn.datasets import load_iris, make_friedman1 from sklearn.metrics import zero_one_loss from sklearn.svm import SVC, SVR from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import cross_val_score from sklearn.utils import check_random_state from sklearn.utils.testing import ignore_warnings from sklearn.utils.testing import assert_greater, assert_equal, assert_true from sklearn.metrics import make_scorer from sklearn.metrics import get_scorer class MockClassifier(object): """ Dummy classifier to test recursive feature elimination """ def __init__(self, foo_param=0): self.foo_param = foo_param def fit(self, X, Y): assert_true(len(X) == len(Y)) self.coef_ = np.ones(X.shape[1], dtype=np.float64) return self def predict(self, T): return T.shape[0] predict_proba = predict decision_function = predict transform = predict def score(self, X=None, Y=None): if self.foo_param > 1: score = 1. else: score = 0. return score def get_params(self, deep=True): return {'foo_param': self.foo_param} def set_params(self, **params): return self def test_rfe_features_importance(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = iris.target clf = RandomForestClassifier(n_estimators=20, random_state=generator, max_depth=2) rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) rfe.fit(X, y) assert_equal(len(rfe.ranking_), X.shape[1]) clf_svc = SVC(kernel="linear") rfe_svc = RFE(estimator=clf_svc, n_features_to_select=4, step=0.1) rfe_svc.fit(X, y) # Check if the supports are equal assert_array_equal(rfe.get_support(), rfe_svc.get_support()) def test_rfe(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] X_sparse = sparse.csr_matrix(X) y = iris.target # dense model clf = SVC(kernel="linear") rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) rfe.fit(X, y) X_r = rfe.transform(X) clf.fit(X_r, y) assert_equal(len(rfe.ranking_), X.shape[1]) # sparse model clf_sparse = SVC(kernel="linear") rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1) rfe_sparse.fit(X_sparse, y) X_r_sparse = rfe_sparse.transform(X_sparse) assert_equal(X_r.shape, iris.data.shape) assert_array_almost_equal(X_r[:10], iris.data[:10]) assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data)) assert_equal(rfe.score(X, y), clf.score(iris.data, iris.target)) assert_array_almost_equal(X_r, X_r_sparse.toarray()) def test_rfe_mockclassifier(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = iris.target # dense model clf = MockClassifier() rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1) rfe.fit(X, y) X_r = rfe.transform(X) clf.fit(X_r, y) assert_equal(len(rfe.ranking_), X.shape[1]) assert_equal(X_r.shape, iris.data.shape) def test_rfecv(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = list(iris.target) # regression test: list should be supported # Test using the score function rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5) rfecv.fit(X, y) # non-regression test for missing worst feature: assert_equal(len(rfecv.grid_scores_), X.shape[1]) assert_equal(len(rfecv.ranking_), X.shape[1]) X_r = rfecv.transform(X) # All the noisy variable were filtered out assert_array_equal(X_r, iris.data) # same in sparse rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5) X_sparse = sparse.csr_matrix(X) rfecv_sparse.fit(X_sparse, y) X_r_sparse = rfecv_sparse.transform(X_sparse) assert_array_equal(X_r_sparse.toarray(), iris.data) # Test using a customized loss function scoring = make_scorer(zero_one_loss, greater_is_better=False) rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, scoring=scoring) ignore_warnings(rfecv.fit)(X, y) X_r = rfecv.transform(X) assert_array_equal(X_r, iris.data) # Test using a scorer scorer = get_scorer('accuracy') rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, scoring=scorer) rfecv.fit(X, y) X_r = rfecv.transform(X) assert_array_equal(X_r, iris.data) # Test fix on grid_scores def test_scorer(estimator, X, y): return 1.0 rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, scoring=test_scorer) rfecv.fit(X, y) assert_array_equal(rfecv.grid_scores_, np.ones(len(rfecv.grid_scores_))) # Same as the first two tests, but with step=2 rfecv = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5) rfecv.fit(X, y) assert_equal(len(rfecv.grid_scores_), 6) assert_equal(len(rfecv.ranking_), X.shape[1]) X_r = rfecv.transform(X) assert_array_equal(X_r, iris.data) rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2, cv=5) X_sparse = sparse.csr_matrix(X) rfecv_sparse.fit(X_sparse, y) X_r_sparse = rfecv_sparse.transform(X_sparse) assert_array_equal(X_r_sparse.toarray(), iris.data) # Verifying that steps < 1 don't blow up. rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=.2, cv=5) X_sparse = sparse.csr_matrix(X) rfecv_sparse.fit(X_sparse, y) X_r_sparse = rfecv_sparse.transform(X_sparse) assert_array_equal(X_r_sparse.toarray(), iris.data) def test_rfecv_mockclassifier(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = list(iris.target) # regression test: list should be supported # Test using the score function rfecv = RFECV(estimator=MockClassifier(), step=1, cv=5) rfecv.fit(X, y) # non-regression test for missing worst feature: assert_equal(len(rfecv.grid_scores_), X.shape[1]) assert_equal(len(rfecv.ranking_), X.shape[1]) def test_rfecv_verbose_output(): # Check verbose=1 is producing an output. from sklearn.externals.six.moves import cStringIO as StringIO import sys sys.stdout = StringIO() generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = list(iris.target) rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, cv=5, verbose=1) rfecv.fit(X, y) verbose_output = sys.stdout verbose_output.seek(0) assert_greater(len(verbose_output.readline()), 0) def test_rfe_estimator_tags(): rfe = RFE(SVC(kernel='linear')) assert_equal(rfe._estimator_type, "classifier") # make sure that cross-validation is stratified iris = load_iris() score = cross_val_score(rfe, iris.data, iris.target) assert_greater(score.min(), .7) def test_rfe_min_step(): n_features = 10 X, y = make_friedman1(n_samples=50, n_features=n_features, random_state=0) n_samples, n_features = X.shape estimator = SVR(kernel="linear") # Test when floor(step * n_features) <= 0 selector = RFE(estimator, step=0.01) sel = selector.fit(X, y) assert_equal(sel.support_.sum(), n_features // 2) # Test when step is between (0,1) and floor(step * n_features) > 0 selector = RFE(estimator, step=0.20) sel = selector.fit(X, y) assert_equal(sel.support_.sum(), n_features // 2) # Test when step is an integer selector = RFE(estimator, step=5) sel = selector.fit(X, y) assert_equal(sel.support_.sum(), n_features // 2) def test_number_of_subsets_of_features(): # In RFE, 'number_of_subsets_of_features' # = the number of iterations in '_fit' # = max(ranking_) # = 1 + (n_features + step - n_features_to_select - 1) // step # After optimization #4534, this number # = 1 + np.ceil((n_features - n_features_to_select) / float(step)) # This test case is to test their equivalence, refer to #4534 and #3824 def formula1(n_features, n_features_to_select, step): return 1 + ((n_features + step - n_features_to_select - 1) // step) def formula2(n_features, n_features_to_select, step): return 1 + np.ceil((n_features - n_features_to_select) / float(step)) # RFE # Case 1, n_features - n_features_to_select is divisible by step # Case 2, n_features - n_features_to_select is not divisible by step n_features_list = [11, 11] n_features_to_select_list = [3, 3] step_list = [2, 3] for n_features, n_features_to_select, step in zip( n_features_list, n_features_to_select_list, step_list): generator = check_random_state(43) X = generator.normal(size=(100, n_features)) y = generator.rand(100).round() rfe = RFE(estimator=SVC(kernel="linear"), n_features_to_select=n_features_to_select, step=step) rfe.fit(X, y) # this number also equals to the maximum of ranking_ assert_equal(np.max(rfe.ranking_), formula1(n_features, n_features_to_select, step)) assert_equal(np.max(rfe.ranking_), formula2(n_features, n_features_to_select, step)) # In RFECV, 'fit' calls 'RFE._fit' # 'number_of_subsets_of_features' of RFE # = the size of 'grid_scores' of RFECV # = the number of iterations of the for loop before optimization #4534 # RFECV, n_features_to_select = 1 # Case 1, n_features - 1 is divisible by step # Case 2, n_features - 1 is not divisible by step n_features_to_select = 1 n_features_list = [11, 10] step_list = [2, 2] for n_features, step in zip(n_features_list, step_list): generator = check_random_state(43) X = generator.normal(size=(100, n_features)) y = generator.rand(100).round() rfecv = RFECV(estimator=SVC(kernel="linear"), step=step, cv=5) rfecv.fit(X, y) assert_equal(rfecv.grid_scores_.shape[0], formula1(n_features, n_features_to_select, step)) assert_equal(rfecv.grid_scores_.shape[0], formula2(n_features, n_features_to_select, step)) def test_rfe_cv_n_jobs(): generator = check_random_state(0) iris = load_iris() X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))] y = iris.target rfecv = RFECV(estimator=SVC(kernel='linear')) rfecv.fit(X, y) rfecv_ranking = rfecv.ranking_ rfecv_grid_scores = rfecv.grid_scores_ rfecv.set_params(n_jobs=2) rfecv.fit(X, y) assert_array_almost_equal(rfecv.ranking_, rfecv_ranking) assert_array_almost_equal(rfecv.grid_scores_, rfecv_grid_scores)