import pickle import unittest import numpy as np import scipy.sparse as sp from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_less from sklearn.utils.testing import raises from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_false, assert_true from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_raises_regexp from sklearn.utils.testing import assert_warns from sklearn.utils.testing import assert_warns_message from sklearn.utils.testing import assert_no_warnings from sklearn.utils.testing import ignore_warnings from sklearn import linear_model, datasets, metrics from sklearn.base import clone from sklearn.linear_model import SGDClassifier, SGDRegressor from sklearn.preprocessing import LabelEncoder, scale, MinMaxScaler from sklearn.preprocessing import StandardScaler from sklearn.exceptions import ConvergenceWarning from sklearn.linear_model import sgd_fast class SparseSGDClassifier(SGDClassifier): def fit(self, X, y, *args, **kw): X = sp.csr_matrix(X) return super(SparseSGDClassifier, self).fit(X, y, *args, **kw) def partial_fit(self, X, y, *args, **kw): X = sp.csr_matrix(X) return super(SparseSGDClassifier, self).partial_fit(X, y, *args, **kw) def decision_function(self, X): X = sp.csr_matrix(X) return super(SparseSGDClassifier, self).decision_function(X) def predict_proba(self, X): X = sp.csr_matrix(X) return super(SparseSGDClassifier, self).predict_proba(X) class SparseSGDRegressor(SGDRegressor): def fit(self, X, y, *args, **kw): X = sp.csr_matrix(X) return SGDRegressor.fit(self, X, y, *args, **kw) def partial_fit(self, X, y, *args, **kw): X = sp.csr_matrix(X) return SGDRegressor.partial_fit(self, X, y, *args, **kw) def decision_function(self, X, *args, **kw): X = sp.csr_matrix(X) return SGDRegressor.decision_function(self, X, *args, **kw) # Test Data # test sample 1 X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) Y = [1, 1, 1, 2, 2, 2] T = np.array([[-1, -1], [2, 2], [3, 2]]) true_result = [1, 2, 2] # test sample 2; string class labels X2 = np.array([[-1, 1], [-0.75, 0.5], [-1.5, 1.5], [1, 1], [0.75, 0.5], [1.5, 1.5], [-1, -1], [0, -0.5], [1, -1]]) Y2 = ["one"] * 3 + ["two"] * 3 + ["three"] * 3 T2 = np.array([[-1.5, 0.5], [1, 2], [0, -2]]) true_result2 = ["one", "two", "three"] # test sample 3 X3 = np.array([[1, 1, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 1, 1], [0, 0, 0, 1, 0, 0], [0, 0, 0, 1, 0, 0]]) Y3 = np.array([1, 1, 1, 1, 2, 2, 2, 2]) # test sample 4 - two more or less redundant feature groups X4 = np.array([[1, 0.9, 0.8, 0, 0, 0], [1, .84, .98, 0, 0, 0], [1, .96, .88, 0, 0, 0], [1, .91, .99, 0, 0, 0], [0, 0, 0, .89, .91, 1], [0, 0, 0, .79, .84, 1], [0, 0, 0, .91, .95, 1], [0, 0, 0, .93, 1, 1]]) Y4 = np.array([1, 1, 1, 1, 2, 2, 2, 2]) iris = datasets.load_iris() # test sample 5 - test sample 1 as binary classification problem X5 = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]) Y5 = [1, 1, 1, 2, 2, 2] true_result5 = [0, 1, 1] # Classification Test Case class CommonTest(object): def factory(self, **kwargs): if "random_state" not in kwargs: kwargs["random_state"] = 42 if "tol" not in kwargs: kwargs["tol"] = None if "max_iter" not in kwargs: kwargs["max_iter"] = 5 return self.factory_class(**kwargs) # a simple implementation of ASGD to use for testing # uses squared loss to find the gradient def asgd(self, X, y, eta, alpha, weight_init=None, intercept_init=0.0): if weight_init is None: weights = np.zeros(X.shape[1]) else: weights = weight_init average_weights = np.zeros(X.shape[1]) intercept = intercept_init average_intercept = 0.0 decay = 1.0 # sparse data has a fixed decay of .01 if (isinstance(self, SparseSGDClassifierTestCase) or isinstance(self, SparseSGDRegressorTestCase)): decay = .01 for i, entry in enumerate(X): p = np.dot(entry, weights) p += intercept gradient = p - y[i] weights *= 1.0 - (eta * alpha) weights += -(eta * gradient * entry) intercept += -(eta * gradient) * decay average_weights *= i average_weights += weights average_weights /= i + 1.0 average_intercept *= i average_intercept += intercept average_intercept /= i + 1.0 return average_weights, average_intercept def _test_warm_start(self, X, Y, lr): # Test that explicit warm restart... clf = self.factory(alpha=0.01, eta0=0.01, shuffle=False, learning_rate=lr) clf.fit(X, Y) clf2 = self.factory(alpha=0.001, eta0=0.01, shuffle=False, learning_rate=lr) clf2.fit(X, Y, coef_init=clf.coef_.copy(), intercept_init=clf.intercept_.copy()) # ... and implicit warm restart are equivalent. clf3 = self.factory(alpha=0.01, eta0=0.01, shuffle=False, warm_start=True, learning_rate=lr) clf3.fit(X, Y) assert_equal(clf3.t_, clf.t_) assert_array_almost_equal(clf3.coef_, clf.coef_) clf3.set_params(alpha=0.001) clf3.fit(X, Y) assert_equal(clf3.t_, clf2.t_) assert_array_almost_equal(clf3.coef_, clf2.coef_) def test_warm_start_constant(self): self._test_warm_start(X, Y, "constant") def test_warm_start_invscaling(self): self._test_warm_start(X, Y, "invscaling") def test_warm_start_optimal(self): self._test_warm_start(X, Y, "optimal") def test_input_format(self): # Input format tests. clf = self.factory(alpha=0.01, shuffle=False) clf.fit(X, Y) Y_ = np.array(Y)[:, np.newaxis] Y_ = np.c_[Y_, Y_] assert_raises(ValueError, clf.fit, X, Y_) def test_clone(self): # Test whether clone works ok. clf = self.factory(alpha=0.01, penalty='l1') clf = clone(clf) clf.set_params(penalty='l2') clf.fit(X, Y) clf2 = self.factory(alpha=0.01, penalty='l2') clf2.fit(X, Y) assert_array_equal(clf.coef_, clf2.coef_) def test_plain_has_no_average_attr(self): clf = self.factory(average=True, eta0=.01) clf.fit(X, Y) assert_true(hasattr(clf, 'average_coef_')) assert_true(hasattr(clf, 'average_intercept_')) assert_true(hasattr(clf, 'standard_intercept_')) assert_true(hasattr(clf, 'standard_coef_')) clf = self.factory() clf.fit(X, Y) assert_false(hasattr(clf, 'average_coef_')) assert_false(hasattr(clf, 'average_intercept_')) assert_false(hasattr(clf, 'standard_intercept_')) assert_false(hasattr(clf, 'standard_coef_')) def test_late_onset_averaging_not_reached(self): clf1 = self.factory(average=600) clf2 = self.factory() for _ in range(100): if isinstance(clf1, SGDClassifier): clf1.partial_fit(X, Y, classes=np.unique(Y)) clf2.partial_fit(X, Y, classes=np.unique(Y)) else: clf1.partial_fit(X, Y) clf2.partial_fit(X, Y) assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=16) assert_almost_equal(clf1.intercept_, clf2.intercept_, decimal=16) def test_late_onset_averaging_reached(self): eta0 = .001 alpha = .0001 Y_encode = np.array(Y) Y_encode[Y_encode == 1] = -1.0 Y_encode[Y_encode == 2] = 1.0 clf1 = self.factory(average=7, learning_rate="constant", loss='squared_loss', eta0=eta0, alpha=alpha, max_iter=2, shuffle=False) clf2 = self.factory(average=0, learning_rate="constant", loss='squared_loss', eta0=eta0, alpha=alpha, max_iter=1, shuffle=False) clf1.fit(X, Y_encode) clf2.fit(X, Y_encode) average_weights, average_intercept = \ self.asgd(X, Y_encode, eta0, alpha, weight_init=clf2.coef_.ravel(), intercept_init=clf2.intercept_) assert_array_almost_equal(clf1.coef_.ravel(), average_weights.ravel(), decimal=16) assert_almost_equal(clf1.intercept_, average_intercept, decimal=16) @raises(ValueError) def test_sgd_bad_alpha_for_optimal_learning_rate(self): # Check whether expected ValueError on bad alpha, i.e. 0 # since alpha is used to compute the optimal learning rate self.factory(alpha=0, learning_rate="optimal") class DenseSGDClassifierTestCase(unittest.TestCase, CommonTest): """Test suite for the dense representation variant of SGD""" factory_class = SGDClassifier def test_sgd(self): # Check that SGD gives any results :-) for loss in ("hinge", "squared_hinge", "log", "modified_huber"): clf = self.factory(penalty='l2', alpha=0.01, fit_intercept=True, loss=loss, max_iter=10, shuffle=True) clf.fit(X, Y) # assert_almost_equal(clf.coef_[0], clf.coef_[1], decimal=7) assert_array_equal(clf.predict(T), true_result) @raises(ValueError) def test_sgd_bad_l1_ratio(self): # Check whether expected ValueError on bad l1_ratio self.factory(l1_ratio=1.1) @raises(ValueError) def test_sgd_bad_learning_rate_schedule(self): # Check whether expected ValueError on bad learning_rate self.factory(learning_rate="") @raises(ValueError) def test_sgd_bad_eta0(self): # Check whether expected ValueError on bad eta0 self.factory(eta0=0, learning_rate="constant") @raises(ValueError) def test_sgd_bad_alpha(self): # Check whether expected ValueError on bad alpha self.factory(alpha=-.1) @raises(ValueError) def test_sgd_bad_penalty(self): # Check whether expected ValueError on bad penalty self.factory(penalty='foobar', l1_ratio=0.85) @raises(ValueError) def test_sgd_bad_loss(self): # Check whether expected ValueError on bad loss self.factory(loss="foobar") @raises(ValueError) def test_sgd_max_iter_param(self): # Test parameter validity check self.factory(max_iter=-10000) @raises(ValueError) def test_sgd_shuffle_param(self): # Test parameter validity check self.factory(shuffle="false") @raises(TypeError) def test_argument_coef(self): # Checks coef_init not allowed as model argument (only fit) # Provided coef_ does not match dataset. self.factory(coef_init=np.zeros((3,))).fit(X, Y) @raises(ValueError) def test_provide_coef(self): # Checks coef_init shape for the warm starts # Provided coef_ does not match dataset. self.factory().fit(X, Y, coef_init=np.zeros((3,))) @raises(ValueError) def test_set_intercept(self): # Checks intercept_ shape for the warm starts # Provided intercept_ does not match dataset. self.factory().fit(X, Y, intercept_init=np.zeros((3,))) def test_set_intercept_binary(self): # Checks intercept_ shape for the warm starts in binary case self.factory().fit(X5, Y5, intercept_init=0) def test_average_binary_computed_correctly(self): # Checks the SGDClassifier correctly computes the average weights eta = .1 alpha = 2. n_samples = 20 n_features = 10 rng = np.random.RandomState(0) X = rng.normal(size=(n_samples, n_features)) w = rng.normal(size=n_features) clf = self.factory(loss='squared_loss', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, max_iter=1, average=True, shuffle=False) # simple linear function without noise y = np.dot(X, w) y = np.sign(y) clf.fit(X, y) average_weights, average_intercept = self.asgd(X, y, eta, alpha) average_weights = average_weights.reshape(1, -1) assert_array_almost_equal(clf.coef_, average_weights, decimal=14) assert_almost_equal(clf.intercept_, average_intercept, decimal=14) def test_set_intercept_to_intercept(self): # Checks intercept_ shape consistency for the warm starts # Inconsistent intercept_ shape. clf = self.factory().fit(X5, Y5) self.factory().fit(X5, Y5, intercept_init=clf.intercept_) clf = self.factory().fit(X, Y) self.factory().fit(X, Y, intercept_init=clf.intercept_) @raises(ValueError) def test_sgd_at_least_two_labels(self): # Target must have at least two labels self.factory(alpha=0.01, max_iter=20).fit(X2, np.ones(9)) def test_partial_fit_weight_class_balanced(self): # partial_fit with class_weight='balanced' not supported""" assert_raises_regexp(ValueError, "class_weight 'balanced' is not supported for " "partial_fit. In order to use 'balanced' weights, " "use compute_class_weight\('balanced', classes, y\). " "In place of y you can us a large enough sample " "of the full training set target to properly " "estimate the class frequency distributions. " "Pass the resulting weights as the class_weight " "parameter.", self.factory(class_weight='balanced').partial_fit, X, Y, classes=np.unique(Y)) def test_sgd_multiclass(self): # Multi-class test case clf = self.factory(alpha=0.01, max_iter=20).fit(X2, Y2) assert_equal(clf.coef_.shape, (3, 2)) assert_equal(clf.intercept_.shape, (3,)) assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3)) pred = clf.predict(T2) assert_array_equal(pred, true_result2) def test_sgd_multiclass_average(self): eta = .001 alpha = .01 # Multi-class average test case clf = self.factory(loss='squared_loss', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, max_iter=1, average=True, shuffle=False) np_Y2 = np.array(Y2) clf.fit(X2, np_Y2) classes = np.unique(np_Y2) for i, cl in enumerate(classes): y_i = np.ones(np_Y2.shape[0]) y_i[np_Y2 != cl] = -1 average_coef, average_intercept = self.asgd(X2, y_i, eta, alpha) assert_array_almost_equal(average_coef, clf.coef_[i], decimal=16) assert_almost_equal(average_intercept, clf.intercept_[i], decimal=16) def test_sgd_multiclass_with_init_coef(self): # Multi-class test case clf = self.factory(alpha=0.01, max_iter=20) clf.fit(X2, Y2, coef_init=np.zeros((3, 2)), intercept_init=np.zeros(3)) assert_equal(clf.coef_.shape, (3, 2)) assert_true(clf.intercept_.shape, (3,)) pred = clf.predict(T2) assert_array_equal(pred, true_result2) def test_sgd_multiclass_njobs(self): # Multi-class test case with multi-core support clf = self.factory(alpha=0.01, max_iter=20, n_jobs=2).fit(X2, Y2) assert_equal(clf.coef_.shape, (3, 2)) assert_equal(clf.intercept_.shape, (3,)) assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3)) pred = clf.predict(T2) assert_array_equal(pred, true_result2) def test_set_coef_multiclass(self): # Checks coef_init and intercept_init shape for multi-class # problems # Provided coef_ does not match dataset clf = self.factory() assert_raises(ValueError, clf.fit, X2, Y2, coef_init=np.zeros((2, 2))) # Provided coef_ does match dataset clf = self.factory().fit(X2, Y2, coef_init=np.zeros((3, 2))) # Provided intercept_ does not match dataset clf = self.factory() assert_raises(ValueError, clf.fit, X2, Y2, intercept_init=np.zeros((1,))) # Provided intercept_ does match dataset. clf = self.factory().fit(X2, Y2, intercept_init=np.zeros((3,))) def test_sgd_proba(self): # Check SGD.predict_proba # Hinge loss does not allow for conditional prob estimate. # We cannot use the factory here, because it defines predict_proba # anyway. clf = SGDClassifier(loss="hinge", alpha=0.01, max_iter=10, tol=None).fit(X, Y) assert_false(hasattr(clf, "predict_proba")) assert_false(hasattr(clf, "predict_log_proba")) # log and modified_huber losses can output probability estimates # binary case for loss in ["log", "modified_huber"]: clf = self.factory(loss=loss, alpha=0.01, max_iter=10) clf.fit(X, Y) p = clf.predict_proba([[3, 2]]) assert_true(p[0, 1] > 0.5) p = clf.predict_proba([[-1, -1]]) assert_true(p[0, 1] < 0.5) p = clf.predict_log_proba([[3, 2]]) assert_true(p[0, 1] > p[0, 0]) p = clf.predict_log_proba([[-1, -1]]) assert_true(p[0, 1] < p[0, 0]) # log loss multiclass probability estimates clf = self.factory(loss="log", alpha=0.01, max_iter=10).fit(X2, Y2) d = clf.decision_function([[.1, -.1], [.3, .2]]) p = clf.predict_proba([[.1, -.1], [.3, .2]]) assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1)) assert_almost_equal(p[0].sum(), 1) assert_true(np.all(p[0] >= 0)) p = clf.predict_proba([[-1, -1]]) d = clf.decision_function([[-1, -1]]) assert_array_equal(np.argsort(p[0]), np.argsort(d[0])) l = clf.predict_log_proba([[3, 2]]) p = clf.predict_proba([[3, 2]]) assert_array_almost_equal(np.log(p), l) l = clf.predict_log_proba([[-1, -1]]) p = clf.predict_proba([[-1, -1]]) assert_array_almost_equal(np.log(p), l) # Modified Huber multiclass probability estimates; requires a separate # test because the hard zero/one probabilities may destroy the # ordering present in decision_function output. clf = self.factory(loss="modified_huber", alpha=0.01, max_iter=10) clf.fit(X2, Y2) d = clf.decision_function([[3, 2]]) p = clf.predict_proba([[3, 2]]) if not isinstance(self, SparseSGDClassifierTestCase): assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1)) else: # XXX the sparse test gets a different X2 (?) assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1)) # the following sample produces decision_function values < -1, # which would cause naive normalization to fail (see comment # in SGDClassifier.predict_proba) x = X.mean(axis=0) d = clf.decision_function([x]) if np.all(d < -1): # XXX not true in sparse test case (why?) p = clf.predict_proba([x]) assert_array_almost_equal(p[0], [1 / 3.] * 3) def test_sgd_l1(self): # Test L1 regularization n = len(X4) rng = np.random.RandomState(13) idx = np.arange(n) rng.shuffle(idx) X = X4[idx, :] Y = Y4[idx] clf = self.factory(penalty='l1', alpha=.2, fit_intercept=False, max_iter=2000, tol=None, shuffle=False) clf.fit(X, Y) assert_array_equal(clf.coef_[0, 1:-1], np.zeros((4,))) pred = clf.predict(X) assert_array_equal(pred, Y) # test sparsify with dense inputs clf.sparsify() assert_true(sp.issparse(clf.coef_)) pred = clf.predict(X) assert_array_equal(pred, Y) # pickle and unpickle with sparse coef_ clf = pickle.loads(pickle.dumps(clf)) assert_true(sp.issparse(clf.coef_)) pred = clf.predict(X) assert_array_equal(pred, Y) def test_class_weights(self): # Test class weights. X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight=None) clf.fit(X, y) assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) # we give a small weights to class 1 clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False, class_weight={1: 0.001}) clf.fit(X, y) # now the hyperplane should rotate clock-wise and # the prediction on this point should shift assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) def test_equal_class_weight(self): # Test if equal class weights approx. equals no class weights. X = [[1, 0], [1, 0], [0, 1], [0, 1]] y = [0, 0, 1, 1] clf = self.factory(alpha=0.1, max_iter=1000, class_weight=None) clf.fit(X, y) X = [[1, 0], [0, 1]] y = [0, 1] clf_weighted = self.factory(alpha=0.1, max_iter=1000, class_weight={0: 0.5, 1: 0.5}) clf_weighted.fit(X, y) # should be similar up to some epsilon due to learning rate schedule assert_almost_equal(clf.coef_, clf_weighted.coef_, decimal=2) @raises(ValueError) def test_wrong_class_weight_label(self): # ValueError due to not existing class label. clf = self.factory(alpha=0.1, max_iter=1000, class_weight={0: 0.5}) clf.fit(X, Y) @raises(ValueError) def test_wrong_class_weight_format(self): # ValueError due to wrong class_weight argument type. clf = self.factory(alpha=0.1, max_iter=1000, class_weight=[0.5]) clf.fit(X, Y) def test_weights_multiplied(self): # Tests that class_weight and sample_weight are multiplicative class_weights = {1: .6, 2: .3} rng = np.random.RandomState(0) sample_weights = rng.random_sample(Y4.shape[0]) multiplied_together = np.copy(sample_weights) multiplied_together[Y4 == 1] *= class_weights[1] multiplied_together[Y4 == 2] *= class_weights[2] clf1 = self.factory(alpha=0.1, max_iter=20, class_weight=class_weights) clf2 = self.factory(alpha=0.1, max_iter=20) clf1.fit(X4, Y4, sample_weight=sample_weights) clf2.fit(X4, Y4, sample_weight=multiplied_together) assert_almost_equal(clf1.coef_, clf2.coef_) def test_balanced_weight(self): # Test class weights for imbalanced data""" # compute reference metrics on iris dataset that is quite balanced by # default X, y = iris.data, iris.target X = scale(X) idx = np.arange(X.shape[0]) rng = np.random.RandomState(6) rng.shuffle(idx) X = X[idx] y = y[idx] clf = self.factory(alpha=0.0001, max_iter=1000, class_weight=None, shuffle=False).fit(X, y) f1 = metrics.f1_score(y, clf.predict(X), average='weighted') assert_almost_equal(f1, 0.96, decimal=1) # make the same prediction using balanced class_weight clf_balanced = self.factory(alpha=0.0001, max_iter=1000, class_weight="balanced", shuffle=False).fit(X, y) f1 = metrics.f1_score(y, clf_balanced.predict(X), average='weighted') assert_almost_equal(f1, 0.96, decimal=1) # Make sure that in the balanced case it does not change anything # to use "balanced" assert_array_almost_equal(clf.coef_, clf_balanced.coef_, 6) # build an very very imbalanced dataset out of iris data X_0 = X[y == 0, :] y_0 = y[y == 0] X_imbalanced = np.vstack([X] + [X_0] * 10) y_imbalanced = np.concatenate([y] + [y_0] * 10) # fit a model on the imbalanced data without class weight info clf = self.factory(max_iter=1000, class_weight=None, shuffle=False) clf.fit(X_imbalanced, y_imbalanced) y_pred = clf.predict(X) assert_less(metrics.f1_score(y, y_pred, average='weighted'), 0.96) # fit a model with balanced class_weight enabled clf = self.factory(max_iter=1000, class_weight="balanced", shuffle=False) clf.fit(X_imbalanced, y_imbalanced) y_pred = clf.predict(X) assert_greater(metrics.f1_score(y, y_pred, average='weighted'), 0.96) def test_sample_weights(self): # Test weights on individual samples X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False) clf.fit(X, y) assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) # we give a small weights to class 1 clf.fit(X, y, sample_weight=[0.001] * 3 + [1] * 2) # now the hyperplane should rotate clock-wise and # the prediction on this point should shift assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) @raises(ValueError) def test_wrong_sample_weights(self): # Test if ValueError is raised if sample_weight has wrong shape clf = self.factory(alpha=0.1, max_iter=1000, fit_intercept=False) # provided sample_weight too long clf.fit(X, Y, sample_weight=np.arange(7)) @raises(ValueError) def test_partial_fit_exception(self): clf = self.factory(alpha=0.01) # classes was not specified clf.partial_fit(X3, Y3) def test_partial_fit_binary(self): third = X.shape[0] // 3 clf = self.factory(alpha=0.01) classes = np.unique(Y) clf.partial_fit(X[:third], Y[:third], classes=classes) assert_equal(clf.coef_.shape, (1, X.shape[1])) assert_equal(clf.intercept_.shape, (1,)) assert_equal(clf.decision_function([[0, 0]]).shape, (1, )) id1 = id(clf.coef_.data) clf.partial_fit(X[third:], Y[third:]) id2 = id(clf.coef_.data) # check that coef_ haven't been re-allocated assert_true(id1, id2) y_pred = clf.predict(T) assert_array_equal(y_pred, true_result) def test_partial_fit_multiclass(self): third = X2.shape[0] // 3 clf = self.factory(alpha=0.01) classes = np.unique(Y2) clf.partial_fit(X2[:third], Y2[:third], classes=classes) assert_equal(clf.coef_.shape, (3, X2.shape[1])) assert_equal(clf.intercept_.shape, (3,)) assert_equal(clf.decision_function([[0, 0]]).shape, (1, 3)) id1 = id(clf.coef_.data) clf.partial_fit(X2[third:], Y2[third:]) id2 = id(clf.coef_.data) # check that coef_ haven't been re-allocated assert_true(id1, id2) def test_partial_fit_multiclass_average(self): third = X2.shape[0] // 3 clf = self.factory(alpha=0.01, average=X2.shape[0]) classes = np.unique(Y2) clf.partial_fit(X2[:third], Y2[:third], classes=classes) assert_equal(clf.coef_.shape, (3, X2.shape[1])) assert_equal(clf.intercept_.shape, (3,)) clf.partial_fit(X2[third:], Y2[third:]) assert_equal(clf.coef_.shape, (3, X2.shape[1])) assert_equal(clf.intercept_.shape, (3,)) def test_fit_then_partial_fit(self): # Partial_fit should work after initial fit in the multiclass case. # Non-regression test for #2496; fit would previously produce a # Fortran-ordered coef_ that subsequent partial_fit couldn't handle. clf = self.factory() clf.fit(X2, Y2) clf.partial_fit(X2, Y2) # no exception here def _test_partial_fit_equal_fit(self, lr): for X_, Y_, T_ in ((X, Y, T), (X2, Y2, T2)): clf = self.factory(alpha=0.01, eta0=0.01, max_iter=2, learning_rate=lr, shuffle=False) clf.fit(X_, Y_) y_pred = clf.decision_function(T_) t = clf.t_ classes = np.unique(Y_) clf = self.factory(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) for i in range(2): clf.partial_fit(X_, Y_, classes=classes) y_pred2 = clf.decision_function(T_) assert_equal(clf.t_, t) assert_array_almost_equal(y_pred, y_pred2, decimal=2) def test_partial_fit_equal_fit_constant(self): self._test_partial_fit_equal_fit("constant") def test_partial_fit_equal_fit_optimal(self): self._test_partial_fit_equal_fit("optimal") def test_partial_fit_equal_fit_invscaling(self): self._test_partial_fit_equal_fit("invscaling") def test_regression_losses(self): clf = self.factory(alpha=0.01, learning_rate="constant", eta0=0.1, loss="epsilon_insensitive") clf.fit(X, Y) assert_equal(1.0, np.mean(clf.predict(X) == Y)) clf = self.factory(alpha=0.01, learning_rate="constant", eta0=0.1, loss="squared_epsilon_insensitive") clf.fit(X, Y) assert_equal(1.0, np.mean(clf.predict(X) == Y)) clf = self.factory(alpha=0.01, loss="huber") clf.fit(X, Y) assert_equal(1.0, np.mean(clf.predict(X) == Y)) clf = self.factory(alpha=0.01, learning_rate="constant", eta0=0.01, loss="squared_loss") clf.fit(X, Y) assert_equal(1.0, np.mean(clf.predict(X) == Y)) def test_warm_start_multiclass(self): self._test_warm_start(X2, Y2, "optimal") def test_multiple_fit(self): # Test multiple calls of fit w/ different shaped inputs. clf = self.factory(alpha=0.01, shuffle=False) clf.fit(X, Y) assert_true(hasattr(clf, "coef_")) # Non-regression test: try fitting with a different label set. y = [["ham", "spam"][i] for i in LabelEncoder().fit_transform(Y)] clf.fit(X[:, :-1], y) class SparseSGDClassifierTestCase(DenseSGDClassifierTestCase): """Run exactly the same tests using the sparse representation variant""" factory_class = SparseSGDClassifier ############################################################################### # Regression Test Case class DenseSGDRegressorTestCase(unittest.TestCase, CommonTest): """Test suite for the dense representation variant of SGD""" factory_class = SGDRegressor def test_sgd(self): # Check that SGD gives any results. clf = self.factory(alpha=0.1, max_iter=2, fit_intercept=False) clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2]) assert_equal(clf.coef_[0], clf.coef_[1]) @raises(ValueError) def test_sgd_bad_penalty(self): # Check whether expected ValueError on bad penalty self.factory(penalty='foobar', l1_ratio=0.85) @raises(ValueError) def test_sgd_bad_loss(self): # Check whether expected ValueError on bad loss self.factory(loss="foobar") def test_sgd_averaged_computed_correctly(self): # Tests the average regressor matches the naive implementation eta = .001 alpha = .01 n_samples = 20 n_features = 10 rng = np.random.RandomState(0) X = rng.normal(size=(n_samples, n_features)) w = rng.normal(size=n_features) # simple linear function without noise y = np.dot(X, w) clf = self.factory(loss='squared_loss', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, max_iter=1, average=True, shuffle=False) clf.fit(X, y) average_weights, average_intercept = self.asgd(X, y, eta, alpha) assert_array_almost_equal(clf.coef_, average_weights, decimal=16) assert_almost_equal(clf.intercept_, average_intercept, decimal=16) def test_sgd_averaged_partial_fit(self): # Tests whether the partial fit yields the same average as the fit eta = .001 alpha = .01 n_samples = 20 n_features = 10 rng = np.random.RandomState(0) X = rng.normal(size=(n_samples, n_features)) w = rng.normal(size=n_features) # simple linear function without noise y = np.dot(X, w) clf = self.factory(loss='squared_loss', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, max_iter=1, average=True, shuffle=False) clf.partial_fit(X[:int(n_samples / 2)][:], y[:int(n_samples / 2)]) clf.partial_fit(X[int(n_samples / 2):][:], y[int(n_samples / 2):]) average_weights, average_intercept = self.asgd(X, y, eta, alpha) assert_array_almost_equal(clf.coef_, average_weights, decimal=16) assert_almost_equal(clf.intercept_[0], average_intercept, decimal=16) def test_average_sparse(self): # Checks the average weights on data with 0s eta = .001 alpha = .01 clf = self.factory(loss='squared_loss', learning_rate='constant', eta0=eta, alpha=alpha, fit_intercept=True, max_iter=1, average=True, shuffle=False) n_samples = Y3.shape[0] clf.partial_fit(X3[:int(n_samples / 2)][:], Y3[:int(n_samples / 2)]) clf.partial_fit(X3[int(n_samples / 2):][:], Y3[int(n_samples / 2):]) average_weights, average_intercept = self.asgd(X3, Y3, eta, alpha) assert_array_almost_equal(clf.coef_, average_weights, decimal=16) assert_almost_equal(clf.intercept_, average_intercept, decimal=16) def test_sgd_least_squares_fit(self): xmin, xmax = -5, 5 n_samples = 100 rng = np.random.RandomState(0) X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) # simple linear function without noise y = 0.5 * X.ravel() clf = self.factory(loss='squared_loss', alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert_greater(score, 0.99) # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() clf = self.factory(loss='squared_loss', alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert_greater(score, 0.5) def test_sgd_epsilon_insensitive(self): xmin, xmax = -5, 5 n_samples = 100 rng = np.random.RandomState(0) X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) # simple linear function without noise y = 0.5 * X.ravel() clf = self.factory(loss='epsilon_insensitive', epsilon=0.01, alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert_true(score > 0.99) # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() clf = self.factory(loss='epsilon_insensitive', epsilon=0.01, alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert_true(score > 0.5) def test_sgd_huber_fit(self): xmin, xmax = -5, 5 n_samples = 100 rng = np.random.RandomState(0) X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1) # simple linear function without noise y = 0.5 * X.ravel() clf = self.factory(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert_greater(score, 0.99) # simple linear function with noise y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel() clf = self.factory(loss="huber", epsilon=0.1, alpha=0.1, max_iter=20, fit_intercept=False) clf.fit(X, y) score = clf.score(X, y) assert_greater(score, 0.5) def test_elasticnet_convergence(self): # Check that the SGD output is consistent with coordinate descent n_samples, n_features = 1000, 5 rng = np.random.RandomState(0) X = rng.randn(n_samples, n_features) # ground_truth linear model that generate y from X and to which the # models should converge if the regularizer would be set to 0.0 ground_truth_coef = rng.randn(n_features) y = np.dot(X, ground_truth_coef) # XXX: alpha = 0.1 seems to cause convergence problems for alpha in [0.01, 0.001]: for l1_ratio in [0.5, 0.8, 1.0]: cd = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False) cd.fit(X, y) sgd = self.factory(penalty='elasticnet', max_iter=50, alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False) sgd.fit(X, y) err_msg = ("cd and sgd did not converge to comparable " "results for alpha=%f and l1_ratio=%f" % (alpha, l1_ratio)) assert_almost_equal(cd.coef_, sgd.coef_, decimal=2, err_msg=err_msg) @ignore_warnings def test_partial_fit(self): third = X.shape[0] // 3 clf = self.factory(alpha=0.01) clf.partial_fit(X[:third], Y[:third]) assert_equal(clf.coef_.shape, (X.shape[1], )) assert_equal(clf.intercept_.shape, (1,)) assert_equal(clf.predict([[0, 0]]).shape, (1, )) id1 = id(clf.coef_.data) clf.partial_fit(X[third:], Y[third:]) id2 = id(clf.coef_.data) # check that coef_ haven't been re-allocated assert_true(id1, id2) def _test_partial_fit_equal_fit(self, lr): clf = self.factory(alpha=0.01, max_iter=2, eta0=0.01, learning_rate=lr, shuffle=False) clf.fit(X, Y) y_pred = clf.predict(T) t = clf.t_ clf = self.factory(alpha=0.01, eta0=0.01, learning_rate=lr, shuffle=False) for i in range(2): clf.partial_fit(X, Y) y_pred2 = clf.predict(T) assert_equal(clf.t_, t) assert_array_almost_equal(y_pred, y_pred2, decimal=2) def test_partial_fit_equal_fit_constant(self): self._test_partial_fit_equal_fit("constant") def test_partial_fit_equal_fit_optimal(self): self._test_partial_fit_equal_fit("optimal") def test_partial_fit_equal_fit_invscaling(self): self._test_partial_fit_equal_fit("invscaling") def test_loss_function_epsilon(self): clf = self.factory(epsilon=0.9) clf.set_params(epsilon=0.1) assert clf.loss_functions['huber'][1] == 0.1 class SparseSGDRegressorTestCase(DenseSGDRegressorTestCase): # Run exactly the same tests using the sparse representation variant factory_class = SparseSGDRegressor def test_l1_ratio(): # Test if l1 ratio extremes match L1 and L2 penalty settings. X, y = datasets.make_classification(n_samples=1000, n_features=100, n_informative=20, random_state=1234) # test if elasticnet with l1_ratio near 1 gives same result as pure l1 est_en = SGDClassifier(alpha=0.001, penalty='elasticnet', tol=None, max_iter=6, l1_ratio=0.9999999999, random_state=42).fit(X, y) est_l1 = SGDClassifier(alpha=0.001, penalty='l1', max_iter=6, random_state=42, tol=None).fit(X, y) assert_array_almost_equal(est_en.coef_, est_l1.coef_) # test if elasticnet with l1_ratio near 0 gives same result as pure l2 est_en = SGDClassifier(alpha=0.001, penalty='elasticnet', tol=None, max_iter=6, l1_ratio=0.0000000001, random_state=42).fit(X, y) est_l2 = SGDClassifier(alpha=0.001, penalty='l2', max_iter=6, random_state=42, tol=None).fit(X, y) assert_array_almost_equal(est_en.coef_, est_l2.coef_) def test_underflow_or_overlow(): with np.errstate(all='raise'): # Generate some weird data with hugely unscaled features rng = np.random.RandomState(0) n_samples = 100 n_features = 10 X = rng.normal(size=(n_samples, n_features)) X[:, :2] *= 1e300 assert_true(np.isfinite(X).all()) # Use MinMaxScaler to scale the data without introducing a numerical # instability (computing the standard deviation naively is not possible # on this data) X_scaled = MinMaxScaler().fit_transform(X) assert_true(np.isfinite(X_scaled).all()) # Define a ground truth on the scaled data ground_truth = rng.normal(size=n_features) y = (np.dot(X_scaled, ground_truth) > 0.).astype(np.int32) assert_array_equal(np.unique(y), [0, 1]) model = SGDClassifier(alpha=0.1, loss='squared_hinge', max_iter=500) # smoke test: model is stable on scaled data model.fit(X_scaled, y) assert_true(np.isfinite(model.coef_).all()) # model is numerically unstable on unscaled data msg_regxp = (r"Floating-point under-/overflow occurred at epoch #.*" " Scaling input data with StandardScaler or MinMaxScaler" " might help.") assert_raises_regexp(ValueError, msg_regxp, model.fit, X, y) def test_numerical_stability_large_gradient(): # Non regression test case for numerical stability on scaled problems # where the gradient can still explode with some losses model = SGDClassifier(loss='squared_hinge', max_iter=10, shuffle=True, penalty='elasticnet', l1_ratio=0.3, alpha=0.01, eta0=0.001, random_state=0, tol=None) with np.errstate(all='raise'): model.fit(iris.data, iris.target) assert_true(np.isfinite(model.coef_).all()) def test_large_regularization(): # Non regression tests for numerical stability issues caused by large # regularization parameters for penalty in ['l2', 'l1', 'elasticnet']: model = SGDClassifier(alpha=1e5, learning_rate='constant', eta0=0.1, penalty=penalty, shuffle=False, tol=None, max_iter=6) with np.errstate(all='raise'): model.fit(iris.data, iris.target) assert_array_almost_equal(model.coef_, np.zeros_like(model.coef_)) def test_tol_parameter(): # Test that the tol parameter behaves as expected X = StandardScaler().fit_transform(iris.data) y = iris.target == 1 # With tol is None, the number of iteration should be equal to max_iter max_iter = 42 model_0 = SGDClassifier(tol=None, random_state=0, max_iter=max_iter) model_0.fit(X, y) assert_equal(max_iter, model_0.n_iter_) # If tol is not None, the number of iteration should be less than max_iter max_iter = 2000 model_1 = SGDClassifier(tol=0, random_state=0, max_iter=max_iter) model_1.fit(X, y) assert_greater(max_iter, model_1.n_iter_) assert_greater(model_1.n_iter_, 5) # A larger tol should yield a smaller number of iteration model_2 = SGDClassifier(tol=0.1, random_state=0, max_iter=max_iter) model_2.fit(X, y) assert_greater(model_1.n_iter_, model_2.n_iter_) assert_greater(model_2.n_iter_, 3) # Strict tolerance and small max_iter should trigger a warning model_3 = SGDClassifier(max_iter=3, tol=1e-3, random_state=0) model_3 = assert_warns(ConvergenceWarning, model_3.fit, X, y) assert_equal(model_3.n_iter_, 3) def test_future_and_deprecation_warnings(): # Test that warnings are raised. Will be removed in 0.21 def init(max_iter=None, tol=None, n_iter=None): sgd = SGDClassifier(max_iter=max_iter, tol=tol, n_iter=n_iter) sgd._validate_params() # When all default values are used msg_future = "max_iter and tol parameters have been added in " assert_warns_message(FutureWarning, msg_future, init) # When n_iter is specified msg_deprecation = "n_iter parameter is deprecated" assert_warns_message(DeprecationWarning, msg_deprecation, init, 6, 0, 5) # When n_iter=None, and at least one of tol and max_iter is specified assert_no_warnings(init, 100, None, None) assert_no_warnings(init, None, 1e-3, None) assert_no_warnings(init, 100, 1e-3, None) @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def test_tol_and_max_iter_default_values(): # Test that the default values are correctly changed est = SGDClassifier() est._validate_params() assert_equal(est._tol, None) assert_equal(est._max_iter, 5) est = SGDClassifier(n_iter=42) est._validate_params() assert_equal(est._tol, None) assert_equal(est._max_iter, 42) est = SGDClassifier(tol=1e-2) est._validate_params() assert_equal(est._tol, 1e-2) assert_equal(est._max_iter, 1000) est = SGDClassifier(max_iter=42) est._validate_params() assert_equal(est._tol, None) assert_equal(est._max_iter, 42) est = SGDClassifier(max_iter=42, tol=1e-2) est._validate_params() assert_equal(est._tol, 1e-2) assert_equal(est._max_iter, 42) def _test_gradient_common(loss_function, cases): # Test gradient of different loss functions # cases is a list of (p, y, expected) for p, y, expected in cases: assert_almost_equal(loss_function.dloss(p, y), expected) def test_gradient_hinge(): # Test Hinge (hinge / perceptron) # hinge loss = sgd_fast.Hinge(1.0) cases = [ # (p, y, expected) (1.1, 1.0, 0.0), (-2.0, -1.0, 0.0), (1.0, 1.0, -1.0), (-1.0, -1.0, 1.0), (0.5, 1.0, -1.0), (2.0, -1.0, 1.0), (-0.5, -1.0, 1.0), (0.0, 1.0, -1.0) ] _test_gradient_common(loss, cases) # perceptron loss = sgd_fast.Hinge(0.0) cases = [ # (p, y, expected) (1.0, 1.0, 0.0), (-0.1, -1.0, 0.0), (0.0, 1.0, -1.0), (0.0, -1.0, 1.0), (0.5, -1.0, 1.0), (2.0, -1.0, 1.0), (-0.5, 1.0, -1.0), (-1.0, 1.0, -1.0), ] _test_gradient_common(loss, cases) def test_gradient_squared_hinge(): # Test SquaredHinge loss = sgd_fast.SquaredHinge(1.0) cases = [ # (p, y, expected) (1.0, 1.0, 0.0), (-2.0, -1.0, 0.0), (1.0, -1.0, 4.0), (-1.0, 1.0, -4.0), (0.5, 1.0, -1.0), (0.5, -1.0, 3.0) ] _test_gradient_common(loss, cases) def test_gradient_log(): # Test Log (logistic loss) loss = sgd_fast.Log() cases = [ # (p, y, expected) (1.0, 1.0, -1.0 / (np.exp(1.0) + 1.0)), (1.0, -1.0, 1.0 / (np.exp(-1.0) + 1.0)), (-1.0, -1.0, 1.0 / (np.exp(1.0) + 1.0)), (-1.0, 1.0, -1.0 / (np.exp(-1.0) + 1.0)), (0.0, 1.0, -0.5), (0.0, -1.0, 0.5), (17.9, -1.0, 1.0), (-17.9, 1.0, -1.0), ] _test_gradient_common(loss, cases) assert_almost_equal(loss.dloss(18.1, 1.0), np.exp(-18.1) * -1.0, 16) assert_almost_equal(loss.dloss(-18.1, -1.0), np.exp(-18.1) * 1.0, 16) def test_gradient_squared_loss(): # Test SquaredLoss loss = sgd_fast.SquaredLoss() cases = [ # (p, y, expected) (0.0, 0.0, 0.0), (1.0, 1.0, 0.0), (1.0, 0.0, 1.0), (0.5, -1.0, 1.5), (-2.5, 2.0, -4.5) ] _test_gradient_common(loss, cases) def test_gradient_huber(): # Test Huber loss = sgd_fast.Huber(0.1) cases = [ # (p, y, expected) (0.0, 0.0, 0.0), (0.1, 0.0, 0.1), (0.0, 0.1, -0.1), (3.95, 4.0, -0.05), (5.0, 2.0, 0.1), (-1.0, 5.0, -0.1) ] _test_gradient_common(loss, cases) def test_gradient_modified_huber(): # Test ModifiedHuber loss = sgd_fast.ModifiedHuber() cases = [ # (p, y, expected) (1.0, 1.0, 0.0), (-1.0, -1.0, 0.0), (2.0, 1.0, 0.0), (0.0, 1.0, -2.0), (-1.0, 1.0, -4.0), (0.5, -1.0, 3.0), (0.5, -1.0, 3.0), (-2.0, 1.0, -4.0), (-3.0, 1.0, -4.0) ] _test_gradient_common(loss, cases) def test_gradient_epsilon_insensitive(): # Test EpsilonInsensitive loss = sgd_fast.EpsilonInsensitive(0.1) cases = [ (0.0, 0.0, 0.0), (0.1, 0.0, 0.0), (-2.05, -2.0, 0.0), (3.05, 3.0, 0.0), (2.2, 2.0, 1.0), (2.0, -1.0, 1.0), (2.0, 2.2, -1.0), (-2.0, 1.0, -1.0) ] _test_gradient_common(loss, cases) def test_gradient_squared_epsilon_insensitive(): # Test SquaredEpsilonInsensitive loss = sgd_fast.SquaredEpsilonInsensitive(0.1) cases = [ (0.0, 0.0, 0.0), (0.1, 0.0, 0.0), (-2.05, -2.0, 0.0), (3.05, 3.0, 0.0), (2.2, 2.0, 0.2), (2.0, -1.0, 5.8), (2.0, 2.2, -0.2), (-2.0, 1.0, -5.8) ] _test_gradient_common(loss, cases)