""" Ridge regression """ # Author: Mathieu Blondel # Reuben Fletcher-Costin # Fabian Pedregosa # Michael Eickenberg # License: BSD 3 clause from abc import ABCMeta, abstractmethod import warnings import numpy as np from scipy import linalg from scipy import sparse from scipy.sparse import linalg as sp_linalg from .base import LinearClassifierMixin, LinearModel, _rescale_data from .sag import sag_solver from ..base import RegressorMixin from ..utils.extmath import safe_sparse_dot from ..utils.extmath import row_norms from ..utils import check_X_y from ..utils import check_array from ..utils import check_consistent_length from ..utils import compute_sample_weight from ..utils import column_or_1d from ..preprocessing import LabelBinarizer from ..model_selection import GridSearchCV from ..externals import six from ..metrics.scorer import check_scoring def _solve_sparse_cg(X, y, alpha, max_iter=None, tol=1e-3, verbose=0): n_samples, n_features = X.shape X1 = sp_linalg.aslinearoperator(X) coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) if n_features > n_samples: def create_mv(curr_alpha): def _mv(x): return X1.matvec(X1.rmatvec(x)) + curr_alpha * x return _mv else: def create_mv(curr_alpha): def _mv(x): return X1.rmatvec(X1.matvec(x)) + curr_alpha * x return _mv for i in range(y.shape[1]): y_column = y[:, i] mv = create_mv(alpha[i]) if n_features > n_samples: # kernel ridge # w = X.T * inv(X X^t + alpha*Id) y C = sp_linalg.LinearOperator( (n_samples, n_samples), matvec=mv, dtype=X.dtype) coef, info = sp_linalg.cg(C, y_column, tol=tol) coefs[i] = X1.rmatvec(coef) else: # linear ridge # w = inv(X^t X + alpha*Id) * X.T y y_column = X1.rmatvec(y_column) C = sp_linalg.LinearOperator( (n_features, n_features), matvec=mv, dtype=X.dtype) coefs[i], info = sp_linalg.cg(C, y_column, maxiter=max_iter, tol=tol) if info < 0: raise ValueError("Failed with error code %d" % info) if max_iter is None and info > 0 and verbose: warnings.warn("sparse_cg did not converge after %d iterations." % info) return coefs def _solve_lsqr(X, y, alpha, max_iter=None, tol=1e-3): n_samples, n_features = X.shape coefs = np.empty((y.shape[1], n_features), dtype=X.dtype) n_iter = np.empty(y.shape[1], dtype=np.int32) # According to the lsqr documentation, alpha = damp^2. sqrt_alpha = np.sqrt(alpha) for i in range(y.shape[1]): y_column = y[:, i] info = sp_linalg.lsqr(X, y_column, damp=sqrt_alpha[i], atol=tol, btol=tol, iter_lim=max_iter) coefs[i] = info[0] n_iter[i] = info[2] return coefs, n_iter def _solve_cholesky(X, y, alpha): # w = inv(X^t X + alpha*Id) * X.T y n_samples, n_features = X.shape n_targets = y.shape[1] A = safe_sparse_dot(X.T, X, dense_output=True) Xy = safe_sparse_dot(X.T, y, dense_output=True) one_alpha = np.array_equal(alpha, len(alpha) * [alpha[0]]) if one_alpha: A.flat[::n_features + 1] += alpha[0] return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T else: coefs = np.empty([n_targets, n_features], dtype=X.dtype) for coef, target, current_alpha in zip(coefs, Xy.T, alpha): A.flat[::n_features + 1] += current_alpha coef[:] = linalg.solve(A, target, sym_pos=True, overwrite_a=False).ravel() A.flat[::n_features + 1] -= current_alpha return coefs def _solve_cholesky_kernel(K, y, alpha, sample_weight=None, copy=False): # dual_coef = inv(X X^t + alpha*Id) y n_samples = K.shape[0] n_targets = y.shape[1] if copy: K = K.copy() alpha = np.atleast_1d(alpha) one_alpha = (alpha == alpha[0]).all() has_sw = isinstance(sample_weight, np.ndarray) \ or sample_weight not in [1.0, None] if has_sw: # Unlike other solvers, we need to support sample_weight directly # because K might be a pre-computed kernel. sw = np.sqrt(np.atleast_1d(sample_weight)) y = y * sw[:, np.newaxis] K *= np.outer(sw, sw) if one_alpha: # Only one penalty, we can solve multi-target problems in one time. K.flat[::n_samples + 1] += alpha[0] try: # Note: we must use overwrite_a=False in order to be able to # use the fall-back solution below in case a LinAlgError # is raised dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False) except np.linalg.LinAlgError: warnings.warn("Singular matrix in solving dual problem. Using " "least-squares solution instead.") dual_coef = linalg.lstsq(K, y)[0] # K is expensive to compute and store in memory so change it back in # case it was user-given. K.flat[::n_samples + 1] -= alpha[0] if has_sw: dual_coef *= sw[:, np.newaxis] return dual_coef else: # One penalty per target. We need to solve each target separately. dual_coefs = np.empty([n_targets, n_samples], K.dtype) for dual_coef, target, current_alpha in zip(dual_coefs, y.T, alpha): K.flat[::n_samples + 1] += current_alpha dual_coef[:] = linalg.solve(K, target, sym_pos=True, overwrite_a=False).ravel() K.flat[::n_samples + 1] -= current_alpha if has_sw: dual_coefs *= sw[np.newaxis, :] return dual_coefs.T def _solve_svd(X, y, alpha): U, s, Vt = linalg.svd(X, full_matrices=False) idx = s > 1e-15 # same default value as scipy.linalg.pinv s_nnz = s[idx][:, np.newaxis] UTy = np.dot(U.T, y) d = np.zeros((s.size, alpha.size), dtype=X.dtype) d[idx] = s_nnz / (s_nnz ** 2 + alpha) d_UT_y = d * UTy return np.dot(Vt.T, d_UT_y).T def ridge_regression(X, y, alpha, sample_weight=None, solver='auto', max_iter=None, tol=1e-3, verbose=0, random_state=None, return_n_iter=False, return_intercept=False): """Solve the ridge equation by the method of normal equations. Read more in the :ref:`User Guide `. Parameters ---------- X : {array-like, sparse matrix, LinearOperator}, shape = [n_samples, n_features] Training data y : array-like, shape = [n_samples] or [n_samples, n_targets] Target values alpha : {float, array-like}, shape = [n_targets] if array-like Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to ``C^-1`` in other linear models such as LogisticRegression or LinearSVC. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number. sample_weight : float or numpy array of shape [n_samples] Individual weights for each sample. If sample_weight is not None and solver='auto', the solver will be set to 'cholesky'. .. versionadded:: 0.17 solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. - 'svd' uses a Singular Value Decomposition of X to compute the Ridge coefficients. More stable for singular matrices than 'cholesky'. - 'cholesky' uses the standard scipy.linalg.solve function to obtain a closed-form solution via a Cholesky decomposition of dot(X.T, X) - 'sparse_cg' uses the conjugate gradient solver as found in scipy.sparse.linalg.cg. As an iterative algorithm, this solver is more appropriate than 'cholesky' for large-scale data (possibility to set `tol` and `max_iter`). - 'lsqr' uses the dedicated regularized least-squares routine scipy.sparse.linalg.lsqr. It is the fastest but may not be available in old scipy versions. It also uses an iterative procedure. - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses its improved, unbiased version named SAGA. Both methods also use an iterative procedure, and are often faster than other solvers when both n_samples and n_features are large. Note that 'sag' and 'saga' fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing. All last five solvers support both dense and sparse data. However, only 'sag' and 'saga' supports sparse input when`fit_intercept` is True. .. versionadded:: 0.17 Stochastic Average Gradient descent solver. .. versionadded:: 0.19 SAGA solver. max_iter : int, optional Maximum number of iterations for conjugate gradient solver. For the 'sparse_cg' and 'lsqr' solvers, the default value is determined by scipy.sparse.linalg. For 'sag' and saga solver, the default value is 1000. tol : float Precision of the solution. verbose : int Verbosity level. Setting verbose > 0 will display additional information depending on the solver used. random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Used when ``solver`` == 'sag'. return_n_iter : boolean, default False If True, the method also returns `n_iter`, the actual number of iteration performed by the solver. .. versionadded:: 0.17 return_intercept : boolean, default False If True and if X is sparse, the method also returns the intercept, and the solver is automatically changed to 'sag'. This is only a temporary fix for fitting the intercept with sparse data. For dense data, use sklearn.linear_model._preprocess_data before your regression. .. versionadded:: 0.17 Returns ------- coef : array, shape = [n_features] or [n_targets, n_features] Weight vector(s). n_iter : int, optional The actual number of iteration performed by the solver. Only returned if `return_n_iter` is True. intercept : float or array, shape = [n_targets] The intercept of the model. Only returned if `return_intercept` is True and if X is a scipy sparse array. Notes ----- This function won't compute the intercept. """ if return_intercept and sparse.issparse(X) and solver != 'sag': if solver != 'auto': warnings.warn("In Ridge, only 'sag' solver can currently fit the " "intercept when X is sparse. Solver has been " "automatically changed into 'sag'.") solver = 'sag' _dtype = [np.float64, np.float32] # SAG needs X and y columns to be C-contiguous and np.float64 if solver in ['sag', 'saga']: X = check_array(X, accept_sparse=['csr'], dtype=np.float64, order='C') y = check_array(y, dtype=np.float64, ensure_2d=False, order='F') else: X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=_dtype) y = check_array(y, dtype=X.dtype, ensure_2d=False) check_consistent_length(X, y) n_samples, n_features = X.shape if y.ndim > 2: raise ValueError("Target y has the wrong shape %s" % str(y.shape)) ravel = False if y.ndim == 1: y = y.reshape(-1, 1) ravel = True n_samples_, n_targets = y.shape if n_samples != n_samples_: raise ValueError("Number of samples in X and y does not correspond:" " %d != %d" % (n_samples, n_samples_)) has_sw = sample_weight is not None if solver == 'auto': # cholesky if it's a dense array and cg in any other case if not sparse.issparse(X) or has_sw: solver = 'cholesky' else: solver = 'sparse_cg' elif solver == 'lsqr' and not hasattr(sp_linalg, 'lsqr'): warnings.warn("""lsqr not available on this machine, falling back to sparse_cg.""") solver = 'sparse_cg' if has_sw: if np.atleast_1d(sample_weight).ndim > 1: raise ValueError("Sample weights must be 1D array or scalar") if solver not in ['sag', 'saga']: # SAG supports sample_weight directly. For other solvers, # we implement sample_weight via a simple rescaling. X, y = _rescale_data(X, y, sample_weight) # There should be either 1 or n_targets penalties alpha = np.asarray(alpha, dtype=X.dtype).ravel() if alpha.size not in [1, n_targets]: raise ValueError("Number of targets and number of penalties " "do not correspond: %d != %d" % (alpha.size, n_targets)) if alpha.size == 1 and n_targets > 1: alpha = np.repeat(alpha, n_targets) if solver not in ('sparse_cg', 'cholesky', 'svd', 'lsqr', 'sag', 'saga'): raise ValueError('Solver %s not understood' % solver) n_iter = None if solver == 'sparse_cg': coef = _solve_sparse_cg(X, y, alpha, max_iter, tol, verbose) elif solver == 'lsqr': coef, n_iter = _solve_lsqr(X, y, alpha, max_iter, tol) elif solver == 'cholesky': if n_features > n_samples: K = safe_sparse_dot(X, X.T, dense_output=True) try: dual_coef = _solve_cholesky_kernel(K, y, alpha) coef = safe_sparse_dot(X.T, dual_coef, dense_output=True).T except linalg.LinAlgError: # use SVD solver if matrix is singular solver = 'svd' else: try: coef = _solve_cholesky(X, y, alpha) except linalg.LinAlgError: # use SVD solver if matrix is singular solver = 'svd' elif solver in ['sag', 'saga']: # precompute max_squared_sum for all targets max_squared_sum = row_norms(X, squared=True).max() coef = np.empty((y.shape[1], n_features)) n_iter = np.empty(y.shape[1], dtype=np.int32) intercept = np.zeros((y.shape[1], )) for i, (alpha_i, target) in enumerate(zip(alpha, y.T)): init = {'coef': np.zeros((n_features + int(return_intercept), 1))} coef_, n_iter_, _ = sag_solver( X, target.ravel(), sample_weight, 'squared', alpha_i, 0, max_iter, tol, verbose, random_state, False, max_squared_sum, init, is_saga=solver == 'saga') if return_intercept: coef[i] = coef_[:-1] intercept[i] = coef_[-1] else: coef[i] = coef_ n_iter[i] = n_iter_ if intercept.shape[0] == 1: intercept = intercept[0] coef = np.asarray(coef) if solver == 'svd': if sparse.issparse(X): raise TypeError('SVD solver does not support sparse' ' inputs currently') coef = _solve_svd(X, y, alpha) if ravel: # When y was passed as a 1d-array, we flatten the coefficients. coef = coef.ravel() if return_n_iter and return_intercept: return coef, n_iter, intercept elif return_intercept: return coef, intercept elif return_n_iter: return coef, n_iter else: return coef class _BaseRidge(six.with_metaclass(ABCMeta, LinearModel)): @abstractmethod def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): self.alpha = alpha self.fit_intercept = fit_intercept self.normalize = normalize self.copy_X = copy_X self.max_iter = max_iter self.tol = tol self.solver = solver self.random_state = random_state def fit(self, X, y, sample_weight=None): if self.solver in ('sag', 'saga'): _dtype = np.float64 else: # all other solvers work at both float precision levels _dtype = [np.float64, np.float32] X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=_dtype, multi_output=True, y_numeric=True) if ((sample_weight is not None) and np.atleast_1d(sample_weight).ndim > 1): raise ValueError("Sample weights must be 1D array or scalar") X, y, X_offset, y_offset, X_scale = self._preprocess_data( X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight) # temporary fix for fitting the intercept with sparse data using 'sag' if sparse.issparse(X) and self.fit_intercept: self.coef_, self.n_iter_, self.intercept_ = ridge_regression( X, y, alpha=self.alpha, sample_weight=sample_weight, max_iter=self.max_iter, tol=self.tol, solver=self.solver, random_state=self.random_state, return_n_iter=True, return_intercept=True) self.intercept_ += y_offset else: self.coef_, self.n_iter_ = ridge_regression( X, y, alpha=self.alpha, sample_weight=sample_weight, max_iter=self.max_iter, tol=self.tol, solver=self.solver, random_state=self.random_state, return_n_iter=True, return_intercept=False) self._set_intercept(X_offset, y_offset, X_scale) return self class Ridge(_BaseRidge, RegressorMixin): """Linear least squares with l2 regularization. This model solves a regression model where the loss function is the linear least squares function and regularization is given by the l2-norm. Also known as Ridge Regression or Tikhonov regularization. This estimator has built-in support for multi-variate regression (i.e., when y is a 2d-array of shape [n_samples, n_targets]). Read more in the :ref:`User Guide `. Parameters ---------- alpha : {float, array-like}, shape (n_targets) Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to ``C^-1`` in other linear models such as LogisticRegression or LinearSVC. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number. fit_intercept : boolean Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). normalize : boolean, optional, default False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. copy_X : boolean, optional, default True If True, X will be copied; else, it may be overwritten. max_iter : int, optional Maximum number of iterations for conjugate gradient solver. For 'sparse_cg' and 'lsqr' solvers, the default value is determined by scipy.sparse.linalg. For 'sag' solver, the default value is 1000. tol : float Precision of the solution. solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. - 'svd' uses a Singular Value Decomposition of X to compute the Ridge coefficients. More stable for singular matrices than 'cholesky'. - 'cholesky' uses the standard scipy.linalg.solve function to obtain a closed-form solution. - 'sparse_cg' uses the conjugate gradient solver as found in scipy.sparse.linalg.cg. As an iterative algorithm, this solver is more appropriate than 'cholesky' for large-scale data (possibility to set `tol` and `max_iter`). - 'lsqr' uses the dedicated regularized least-squares routine scipy.sparse.linalg.lsqr. It is the fastest but may not be available in old scipy versions. It also uses an iterative procedure. - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses its improved, unbiased version named SAGA. Both methods also use an iterative procedure, and are often faster than other solvers when both n_samples and n_features are large. Note that 'sag' and 'saga' fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing. All last five solvers support both dense and sparse data. However, only 'sag' and 'saga' supports sparse input when `fit_intercept` is True. .. versionadded:: 0.17 Stochastic Average Gradient descent solver. .. versionadded:: 0.19 SAGA solver. random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Used when ``solver`` == 'sag'. .. versionadded:: 0.17 *random_state* to support Stochastic Average Gradient. Attributes ---------- coef_ : array, shape (n_features,) or (n_targets, n_features) Weight vector(s). intercept_ : float | array, shape = (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. n_iter_ : array or None, shape (n_targets,) Actual number of iterations for each target. Available only for sag and lsqr solvers. Other solvers will return None. .. versionadded:: 0.17 See also -------- RidgeClassifier, RidgeCV, :class:`sklearn.kernel_ridge.KernelRidge` Examples -------- >>> from sklearn.linear_model import Ridge >>> import numpy as np >>> n_samples, n_features = 10, 5 >>> np.random.seed(0) >>> y = np.random.randn(n_samples) >>> X = np.random.randn(n_samples, n_features) >>> clf = Ridge(alpha=1.0) >>> clf.fit(X, y) # doctest: +NORMALIZE_WHITESPACE Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=None, solver='auto', tol=0.001) """ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, solver="auto", random_state=None): super(Ridge, self).__init__(alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, max_iter=max_iter, tol=tol, solver=solver, random_state=random_state) def fit(self, X, y, sample_weight=None): """Fit Ridge regression model Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] Training data y : array-like, shape = [n_samples] or [n_samples, n_targets] Target values sample_weight : float or numpy array of shape [n_samples] Individual weights for each sample Returns ------- self : returns an instance of self. """ return super(Ridge, self).fit(X, y, sample_weight=sample_weight) class RidgeClassifier(LinearClassifierMixin, _BaseRidge): """Classifier using Ridge regression. Read more in the :ref:`User Guide `. Parameters ---------- alpha : float Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to ``C^-1`` in other linear models such as LogisticRegression or LinearSVC. fit_intercept : boolean Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). normalize : boolean, optional, default False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. copy_X : boolean, optional, default True If True, X will be copied; else, it may be overwritten. max_iter : int, optional Maximum number of iterations for conjugate gradient solver. The default value is determined by scipy.sparse.linalg. tol : float Precision of the solution. class_weight : dict or 'balanced', optional Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'} Solver to use in the computational routines: - 'auto' chooses the solver automatically based on the type of data. - 'svd' uses a Singular Value Decomposition of X to compute the Ridge coefficients. More stable for singular matrices than 'cholesky'. - 'cholesky' uses the standard scipy.linalg.solve function to obtain a closed-form solution. - 'sparse_cg' uses the conjugate gradient solver as found in scipy.sparse.linalg.cg. As an iterative algorithm, this solver is more appropriate than 'cholesky' for large-scale data (possibility to set `tol` and `max_iter`). - 'lsqr' uses the dedicated regularized least-squares routine scipy.sparse.linalg.lsqr. It is the fastest but may not be available in old scipy versions. It also uses an iterative procedure. - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses its unbiased and more flexible version named SAGA. Both methods use an iterative procedure, and are often faster than other solvers when both n_samples and n_features are large. Note that 'sag' and 'saga' fast convergence is only guaranteed on features with approximately the same scale. You can preprocess the data with a scaler from sklearn.preprocessing. .. versionadded:: 0.17 Stochastic Average Gradient descent solver. .. versionadded:: 0.19 SAGA solver. random_state : int, RandomState instance or None, optional, default None The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Used when ``solver`` == 'sag'. Attributes ---------- coef_ : array, shape (n_features,) or (n_classes, n_features) Weight vector(s). intercept_ : float | array, shape = (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. n_iter_ : array or None, shape (n_targets,) Actual number of iterations for each target. Available only for sag and lsqr solvers. Other solvers will return None. See also -------- Ridge, RidgeClassifierCV Notes ----- For multi-class classification, n_class classifiers are trained in a one-versus-all approach. Concretely, this is implemented by taking advantage of the multi-variate response support in Ridge. """ def __init__(self, alpha=1.0, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=1e-3, class_weight=None, solver="auto", random_state=None): super(RidgeClassifier, self).__init__( alpha=alpha, fit_intercept=fit_intercept, normalize=normalize, copy_X=copy_X, max_iter=max_iter, tol=tol, solver=solver, random_state=random_state) self.class_weight = class_weight def fit(self, X, y, sample_weight=None): """Fit Ridge regression model. Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples,n_features] Training data y : array-like, shape = [n_samples] Target values sample_weight : float or numpy array of shape (n_samples,) Sample weight. .. versionadded:: 0.17 *sample_weight* support to Classifier. Returns ------- self : returns an instance of self. """ self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = self._label_binarizer.fit_transform(y) if not self._label_binarizer.y_type_.startswith('multilabel'): y = column_or_1d(y, warn=True) else: # we don't (yet) support multi-label classification in Ridge raise ValueError( "%s doesn't support multi-label classification" % ( self.__class__.__name__)) if self.class_weight: if sample_weight is None: sample_weight = 1. # modify the sample weights with the corresponding class weight sample_weight = (sample_weight * compute_sample_weight(self.class_weight, y)) super(RidgeClassifier, self).fit(X, Y, sample_weight=sample_weight) return self @property def classes_(self): return self._label_binarizer.classes_ class _RidgeGCV(LinearModel): """Ridge regression with built-in Generalized Cross-Validation It allows efficient Leave-One-Out cross-validation. This class is not intended to be used directly. Use RidgeCV instead. Notes ----- We want to solve (K + alpha*Id)c = y, where K = X X^T is the kernel matrix. Let G = (K + alpha*Id)^-1. Dual solution: c = Gy Primal solution: w = X^T c Compute eigendecomposition K = Q V Q^T. Then G = Q (V + alpha*Id)^-1 Q^T, where (V + alpha*Id) is diagonal. It is thus inexpensive to inverse for many alphas. Let loov be the vector of prediction values for each example when the model was fitted with all examples but this example. loov = (KGY - diag(KG)Y) / diag(I-KG) Let looe be the vector of prediction errors for each example when the model was fitted with all examples but this example. looe = y - loov = c / diag(G) References ---------- http://cbcl.mit.edu/projects/cbcl/publications/ps/MIT-CSAIL-TR-2007-025.pdf http://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf """ def __init__(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True, normalize=False, scoring=None, copy_X=True, gcv_mode=None, store_cv_values=False): self.alphas = np.asarray(alphas) self.fit_intercept = fit_intercept self.normalize = normalize self.scoring = scoring self.copy_X = copy_X self.gcv_mode = gcv_mode self.store_cv_values = store_cv_values def _pre_compute(self, X, y, centered_kernel=True): # even if X is very sparse, K is usually very dense K = safe_sparse_dot(X, X.T, dense_output=True) # the following emulates an additional constant regressor # corresponding to fit_intercept=True # but this is done only when the features have been centered if centered_kernel: K += np.ones_like(K) v, Q = linalg.eigh(K) QT_y = np.dot(Q.T, y) return v, Q, QT_y def _decomp_diag(self, v_prime, Q): # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T)) return (v_prime * Q ** 2).sum(axis=-1) def _diag_dot(self, D, B): # compute dot(diag(D), B) if len(B.shape) > 1: # handle case where B is > 1-d D = D[(slice(None), ) + (np.newaxis, ) * (len(B.shape) - 1)] return D * B def _errors_and_values_helper(self, alpha, y, v, Q, QT_y): """Helper function to avoid code duplication between self._errors and self._values. Notes ----- We don't construct matrix G, instead compute action on y & diagonal. """ w = 1. / (v + alpha) constant_column = np.var(Q, 0) < 1.e-12 # detect constant columns w[constant_column] = 0 # cancel the regularization for the intercept c = np.dot(Q, self._diag_dot(w, QT_y)) G_diag = self._decomp_diag(w, Q) # handle case where y is 2-d if len(y.shape) != 1: G_diag = G_diag[:, np.newaxis] return G_diag, c def _errors(self, alpha, y, v, Q, QT_y): G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y) return (c / G_diag) ** 2, c def _values(self, alpha, y, v, Q, QT_y): G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y) return y - (c / G_diag), c def _pre_compute_svd(self, X, y, centered_kernel=True): if sparse.issparse(X): raise TypeError("SVD not supported for sparse matrices") if centered_kernel: X = np.hstack((X, np.ones((X.shape[0], 1)))) # to emulate fit_intercept=True situation, add a column on ones # Note that by centering, the other columns are orthogonal to that one U, s, _ = linalg.svd(X, full_matrices=0) v = s ** 2 UT_y = np.dot(U.T, y) return v, U, UT_y def _errors_and_values_svd_helper(self, alpha, y, v, U, UT_y): """Helper function to avoid code duplication between self._errors_svd and self._values_svd. """ constant_column = np.var(U, 0) < 1.e-12 # detect columns colinear to ones w = ((v + alpha) ** -1) - (alpha ** -1) w[constant_column] = - (alpha ** -1) # cancel the regularization for the intercept c = np.dot(U, self._diag_dot(w, UT_y)) + (alpha ** -1) * y G_diag = self._decomp_diag(w, U) + (alpha ** -1) if len(y.shape) != 1: # handle case where y is 2-d G_diag = G_diag[:, np.newaxis] return G_diag, c def _errors_svd(self, alpha, y, v, U, UT_y): G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y) return (c / G_diag) ** 2, c def _values_svd(self, alpha, y, v, U, UT_y): G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y) return y - (c / G_diag), c def fit(self, X, y, sample_weight=None): """Fit Ridge regression model Parameters ---------- X : {array-like, sparse matrix}, shape = [n_samples, n_features] Training data y : array-like, shape = [n_samples] or [n_samples, n_targets] Target values. Will be cast to X's dtype if necessary sample_weight : float or array-like of shape [n_samples] Sample weight Returns ------- self : Returns self. """ X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64, multi_output=True, y_numeric=True) if sample_weight is not None and not isinstance(sample_weight, float): sample_weight = check_array(sample_weight, ensure_2d=False) n_samples, n_features = X.shape X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data( X, y, self.fit_intercept, self.normalize, self.copy_X, sample_weight=sample_weight) gcv_mode = self.gcv_mode with_sw = len(np.shape(sample_weight)) if gcv_mode is None or gcv_mode == 'auto': if sparse.issparse(X) or n_features > n_samples or with_sw: gcv_mode = 'eigen' else: gcv_mode = 'svd' elif gcv_mode == "svd" and with_sw: # FIXME non-uniform sample weights not yet supported warnings.warn("non-uniform sample weights unsupported for svd, " "forcing usage of eigen") gcv_mode = 'eigen' if gcv_mode == 'eigen': _pre_compute = self._pre_compute _errors = self._errors _values = self._values elif gcv_mode == 'svd': # assert n_samples >= n_features _pre_compute = self._pre_compute_svd _errors = self._errors_svd _values = self._values_svd else: raise ValueError('bad gcv_mode "%s"' % gcv_mode) if sample_weight is not None: X, y = _rescale_data(X, y, sample_weight) centered_kernel = not sparse.issparse(X) and self.fit_intercept v, Q, QT_y = _pre_compute(X, y, centered_kernel) n_y = 1 if len(y.shape) == 1 else y.shape[1] cv_values = np.zeros((n_samples * n_y, len(self.alphas))) C = [] scorer = check_scoring(self, scoring=self.scoring, allow_none=True) error = scorer is None for i, alpha in enumerate(self.alphas): if error: out, c = _errors(alpha, y, v, Q, QT_y) else: out, c = _values(alpha, y, v, Q, QT_y) cv_values[:, i] = out.ravel() C.append(c) if error: best = cv_values.mean(axis=0).argmin() else: # The scorer want an object that will make the predictions but # they are already computed efficiently by _RidgeGCV. This # identity_estimator will just return them def identity_estimator(): pass identity_estimator.decision_function = lambda y_predict: y_predict identity_estimator.predict = lambda y_predict: y_predict out = [scorer(identity_estimator, y.ravel(), cv_values[:, i]) for i in range(len(self.alphas))] best = np.argmax(out) self.alpha_ = self.alphas[best] self.dual_coef_ = C[best] self.coef_ = safe_sparse_dot(self.dual_coef_.T, X) self._set_intercept(X_offset, y_offset, X_scale) if self.store_cv_values: if len(y.shape) == 1: cv_values_shape = n_samples, len(self.alphas) else: cv_values_shape = n_samples, n_y, len(self.alphas) self.cv_values_ = cv_values.reshape(cv_values_shape) return self class _BaseRidgeCV(LinearModel): def __init__(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True, normalize=False, scoring=None, cv=None, gcv_mode=None, store_cv_values=False): self.alphas = alphas self.fit_intercept = fit_intercept self.normalize = normalize self.scoring = scoring self.cv = cv self.gcv_mode = gcv_mode self.store_cv_values = store_cv_values def fit(self, X, y, sample_weight=None): """Fit Ridge regression model Parameters ---------- X : array-like, shape = [n_samples, n_features] Training data y : array-like, shape = [n_samples] or [n_samples, n_targets] Target values. Will be cast to X's dtype if necessary sample_weight : float or array-like of shape [n_samples] Sample weight Returns ------- self : Returns self. """ if self.cv is None: estimator = _RidgeGCV(self.alphas, fit_intercept=self.fit_intercept, normalize=self.normalize, scoring=self.scoring, gcv_mode=self.gcv_mode, store_cv_values=self.store_cv_values) estimator.fit(X, y, sample_weight=sample_weight) self.alpha_ = estimator.alpha_ if self.store_cv_values: self.cv_values_ = estimator.cv_values_ else: if self.store_cv_values: raise ValueError("cv!=None and store_cv_values=True " " are incompatible") parameters = {'alpha': self.alphas} gs = GridSearchCV(Ridge(fit_intercept=self.fit_intercept, normalize=self.normalize), parameters, cv=self.cv, scoring=self.scoring) gs.fit(X, y, sample_weight=sample_weight) estimator = gs.best_estimator_ self.alpha_ = gs.best_estimator_.alpha self.coef_ = estimator.coef_ self.intercept_ = estimator.intercept_ return self class RidgeCV(_BaseRidgeCV, RegressorMixin): """Ridge regression with built-in cross-validation. By default, it performs Generalized Cross-Validation, which is a form of efficient Leave-One-Out cross-validation. Read more in the :ref:`User Guide `. Parameters ---------- alphas : numpy array of shape [n_alphas] Array of alpha values to try. Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to ``C^-1`` in other linear models such as LogisticRegression or LinearSVC. fit_intercept : boolean Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). normalize : boolean, optional, default False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. scoring : string, callable or None, optional, default: None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the efficient Leave-One-Out cross-validation - integer, to specify the number of folds. - An object to be used as a cross-validation generator. - An iterable yielding train/test splits. For integer/None inputs, if ``y`` is binary or multiclass, :class:`sklearn.model_selection.StratifiedKFold` is used, else, :class:`sklearn.model_selection.KFold` is used. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. gcv_mode : {None, 'auto', 'svd', eigen'}, optional Flag indicating which strategy to use when performing Generalized Cross-Validation. Options are:: 'auto' : use svd if n_samples > n_features or when X is a sparse matrix, otherwise use eigen 'svd' : force computation via singular value decomposition of X (does not work for sparse matrices) 'eigen' : force computation via eigendecomposition of X^T X The 'auto' mode is the default and is intended to pick the cheaper option of the two depending upon the shape and format of the training data. store_cv_values : boolean, default=False Flag indicating if the cross-validation values corresponding to each alpha should be stored in the `cv_values_` attribute (see below). This flag is only compatible with `cv=None` (i.e. using Generalized Cross-Validation). Attributes ---------- cv_values_ : array, shape = [n_samples, n_alphas] or \ shape = [n_samples, n_targets, n_alphas], optional Cross-validation values for each alpha (if `store_cv_values=True` and \ `cv=None`). After `fit()` has been called, this attribute will \ contain the mean squared errors (by default) or the values of the \ `{loss,score}_func` function (if provided in the constructor). coef_ : array, shape = [n_features] or [n_targets, n_features] Weight vector(s). intercept_ : float | array, shape = (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. alpha_ : float Estimated regularization parameter. See also -------- Ridge: Ridge regression RidgeClassifier: Ridge classifier RidgeClassifierCV: Ridge classifier with built-in cross validation """ pass class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV): """Ridge classifier with built-in cross-validation. By default, it performs Generalized Cross-Validation, which is a form of efficient Leave-One-Out cross-validation. Currently, only the n_features > n_samples case is handled efficiently. Read more in the :ref:`User Guide `. Parameters ---------- alphas : numpy array of shape [n_alphas] Array of alpha values to try. Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to ``C^-1`` in other linear models such as LogisticRegression or LinearSVC. fit_intercept : boolean Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). normalize : boolean, optional, default False This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``. scoring : string, callable or None, optional, default: None A string (see model evaluation documentation) or a scorer callable object / function with signature ``scorer(estimator, X, y)``. cv : int, cross-validation generator or an iterable, optional Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the efficient Leave-One-Out cross-validation - integer, to specify the number of folds. - An object to be used as a cross-validation generator. - An iterable yielding train/test splits. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. class_weight : dict or 'balanced', optional Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` Attributes ---------- cv_values_ : array, shape = [n_samples, n_alphas] or \ shape = [n_samples, n_responses, n_alphas], optional Cross-validation values for each alpha (if `store_cv_values=True` and `cv=None`). After `fit()` has been called, this attribute will contain \ the mean squared errors (by default) or the values of the \ `{loss,score}_func` function (if provided in the constructor). coef_ : array, shape = [n_features] or [n_targets, n_features] Weight vector(s). intercept_ : float | array, shape = (n_targets,) Independent term in decision function. Set to 0.0 if ``fit_intercept = False``. alpha_ : float Estimated regularization parameter See also -------- Ridge: Ridge regression RidgeClassifier: Ridge classifier RidgeCV: Ridge regression with built-in cross validation Notes ----- For multi-class classification, n_class classifiers are trained in a one-versus-all approach. Concretely, this is implemented by taking advantage of the multi-variate response support in Ridge. """ def __init__(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True, normalize=False, scoring=None, cv=None, class_weight=None): super(RidgeClassifierCV, self).__init__( alphas=alphas, fit_intercept=fit_intercept, normalize=normalize, scoring=scoring, cv=cv) self.class_weight = class_weight def fit(self, X, y, sample_weight=None): """Fit the ridge classifier. Parameters ---------- X : array-like, shape (n_samples, n_features) Training vectors, where n_samples is the number of samples and n_features is the number of features. y : array-like, shape (n_samples,) Target values. Will be cast to X's dtype if necessary sample_weight : float or numpy array of shape (n_samples,) Sample weight. Returns ------- self : object Returns self. """ self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1) Y = self._label_binarizer.fit_transform(y) if not self._label_binarizer.y_type_.startswith('multilabel'): y = column_or_1d(y, warn=True) if self.class_weight: if sample_weight is None: sample_weight = 1. # modify the sample weights with the corresponding class weight sample_weight = (sample_weight * compute_sample_weight(self.class_weight, y)) _BaseRidgeCV.fit(self, X, Y, sample_weight=sample_weight) return self @property def classes_(self): return self._label_binarizer.classes_