"""
Ridge regression
"""

# Author: Mathieu Blondel <mathieu@mblondel.org>
#         Reuben Fletcher-Costin <reuben.fletchercostin@gmail.com>
#         Fabian Pedregosa <fabian@fseoane.net>
#         Michael Eickenberg <michael.eickenberg@nsup.org>
# License: BSD 3 clause


from abc import ABCMeta, abstractmethod
import warnings

import numpy as np
from scipy import linalg
from scipy import sparse
from scipy.sparse import linalg as sp_linalg

from .base import LinearClassifierMixin, LinearModel, _rescale_data
from .sag import sag_solver
from ..base import RegressorMixin
from ..utils.extmath import safe_sparse_dot
from ..utils.extmath import row_norms
from ..utils import check_X_y
from ..utils import check_array
from ..utils import check_consistent_length
from ..utils import compute_sample_weight
from ..utils import column_or_1d
from ..preprocessing import LabelBinarizer
from ..model_selection import GridSearchCV
from ..externals import six
from ..metrics.scorer import check_scoring

def _solve_sparse_cg(X, y, alpha, max_iter=None, tol=1e-3, verbose=0):
    n_samples, n_features = X.shape
    X1 = sp_linalg.aslinearoperator(X)
    coefs = np.empty((y.shape[1], n_features), dtype=X.dtype)

    if n_features > n_samples:
        def create_mv(curr_alpha):
            def _mv(x):
                return X1.matvec(X1.rmatvec(x)) + curr_alpha * x
            return _mv
    else:
        def create_mv(curr_alpha):
            def _mv(x):
                return X1.rmatvec(X1.matvec(x)) + curr_alpha * x
            return _mv

    for i in range(y.shape[1]):
        y_column = y[:, i]

        mv = create_mv(alpha[i])
        if n_features > n_samples:
            # kernel ridge
            # w = X.T * inv(X X^t + alpha*Id) y
            C = sp_linalg.LinearOperator(
                (n_samples, n_samples), matvec=mv, dtype=X.dtype)
            coef, info = sp_linalg.cg(C, y_column, tol=tol)
            coefs[i] = X1.rmatvec(coef)
        else:
            # linear ridge
            # w = inv(X^t X + alpha*Id) * X.T y
            y_column = X1.rmatvec(y_column)
            C = sp_linalg.LinearOperator(
                (n_features, n_features), matvec=mv, dtype=X.dtype)
            coefs[i], info = sp_linalg.cg(C, y_column, maxiter=max_iter,
                                          tol=tol)
        if info < 0:
            raise ValueError("Failed with error code %d" % info)

        if max_iter is None and info > 0 and verbose:
            warnings.warn("sparse_cg did not converge after %d iterations." %
                          info)

    return coefs

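# Illustrative sketch (not executed): the two CG branches above rely on the
# matrix identity X^T (X X^T + alpha*I)^-1 y == (X^T X + alpha*I)^-1 X^T y.
# It can be checked directly with dense linear algebra; the names below
# (X_, y_, a_) are made up for the check only.
#
#     import numpy as np
#     rng = np.random.RandomState(0)
#     X_, y_, a_ = rng.randn(5, 3), rng.randn(5), 2.0
#     w_primal = np.linalg.solve(X_.T.dot(X_) + a_ * np.eye(3), X_.T.dot(y_))
#     w_dual = X_.T.dot(np.linalg.solve(X_.dot(X_.T) + a_ * np.eye(5), y_))
#     np.allclose(w_primal, w_dual)  # expected: True

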
def _solve_lsqr(X, y, alpha, max_iter=None, tol=1e-3):
    n_samples, n_features = X.shape
    coefs = np.empty((y.shape[1], n_features), dtype=X.dtype)
    n_iter = np.empty(y.shape[1], dtype=np.int32)

    # According to the lsqr documentation, alpha = damp^2.
    sqrt_alpha = np.sqrt(alpha)

    for i in range(y.shape[1]):
        y_column = y[:, i]
        info = sp_linalg.lsqr(X, y_column, damp=sqrt_alpha[i],
                              atol=tol, btol=tol, iter_lim=max_iter)
        coefs[i] = info[0]
        n_iter[i] = info[2]

    return coefs, n_iter

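# Note: scipy.sparse.linalg.lsqr minimizes ||X w - y||^2 + damp^2 * ||w||^2,
# so passing damp = sqrt(alpha) above yields the same ridge objective as the
# other solvers (up to the solver tolerance).

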
def _solve_cholesky(X, y, alpha):
    # w = inv(X^t X + alpha*Id) * X.T y
    n_samples, n_features = X.shape
    n_targets = y.shape[1]

    A = safe_sparse_dot(X.T, X, dense_output=True)
    Xy = safe_sparse_dot(X.T, y, dense_output=True)

    one_alpha = np.array_equal(alpha, len(alpha) * [alpha[0]])

    if one_alpha:
        A.flat[::n_features + 1] += alpha[0]
        return linalg.solve(A, Xy, sym_pos=True,
                            overwrite_a=True).T
    else:
        coefs = np.empty([n_targets, n_features], dtype=X.dtype)
        for coef, target, current_alpha in zip(coefs, Xy.T, alpha):
            A.flat[::n_features + 1] += current_alpha
            coef[:] = linalg.solve(A, target, sym_pos=True,
                                   overwrite_a=False).ravel()
            A.flat[::n_features + 1] -= current_alpha
        return coefs

def _solve_cholesky_kernel(K, y, alpha, sample_weight=None, copy=False):
    # dual_coef = inv(X X^t + alpha*Id) y
    n_samples = K.shape[0]
    n_targets = y.shape[1]

    if copy:
        K = K.copy()

    alpha = np.atleast_1d(alpha)
    one_alpha = (alpha == alpha[0]).all()
    has_sw = isinstance(sample_weight, np.ndarray) \
        or sample_weight not in [1.0, None]

    if has_sw:
        # Unlike other solvers, we need to support sample_weight directly
        # because K might be a pre-computed kernel.
        sw = np.sqrt(np.atleast_1d(sample_weight))
        y = y * sw[:, np.newaxis]
        K *= np.outer(sw, sw)

    if one_alpha:
        # Only one penalty, we can solve multi-target problems at once.
        K.flat[::n_samples + 1] += alpha[0]

        try:
            # Note: we must use overwrite_a=False in order to be able to
            # use the fall-back solution below in case a LinAlgError
            # is raised
            dual_coef = linalg.solve(K, y, sym_pos=True,
                                     overwrite_a=False)
        except np.linalg.LinAlgError:
            warnings.warn("Singular matrix in solving dual problem. Using "
                          "least-squares solution instead.")
            dual_coef = linalg.lstsq(K, y)[0]

        # K is expensive to compute and store in memory so change it back in
        # case it was user-given.
        K.flat[::n_samples + 1] -= alpha[0]

        if has_sw:
            dual_coef *= sw[:, np.newaxis]

        return dual_coef
    else:
        # One penalty per target. We need to solve each target separately.
        dual_coefs = np.empty([n_targets, n_samples], K.dtype)

        for dual_coef, target, current_alpha in zip(dual_coefs, y.T, alpha):
            K.flat[::n_samples + 1] += current_alpha

            dual_coef[:] = linalg.solve(K, target, sym_pos=True,
                                        overwrite_a=False).ravel()

            K.flat[::n_samples + 1] -= current_alpha

        if has_sw:
            dual_coefs *= sw[np.newaxis, :]

        return dual_coefs.T

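# Illustrative sketch (not executed): the sqrt(sample_weight) rescaling above
# reproduces a weighted ridge fit in the dual. On a small dense problem it
# matches the primal solution of (X^T W X + alpha*I) w = X^T W y; the names
# below (X_, y_, a_, sw_, dc) are made up for the check only.
#
#     import numpy as np
#     rng = np.random.RandomState(0)
#     X_, y_, a_ = rng.randn(6, 9), rng.randn(6, 1), 0.3
#     sw_ = rng.rand(6) + 0.5
#     w_primal = np.linalg.solve(X_.T.dot(sw_[:, None] * X_) + a_ * np.eye(9),
#                                X_.T.dot(sw_ * y_[:, 0]))
#     dc = _solve_cholesky_kernel(X_.dot(X_.T), y_, a_, sample_weight=sw_,
#                                 copy=True)
#     np.allclose(X_.T.dot(dc)[:, 0], w_primal)  # expected: True

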
def _solve_svd(X, y, alpha):
    U, s, Vt = linalg.svd(X, full_matrices=False)
    idx = s > 1e-15  # same default value as scipy.linalg.pinv
    s_nnz = s[idx][:, np.newaxis]
    UTy = np.dot(U.T, y)
    d = np.zeros((s.size, alpha.size), dtype=X.dtype)
    d[idx] = s_nnz / (s_nnz ** 2 + alpha)
    d_UT_y = d * UTy
    return np.dot(Vt.T, d_UT_y).T

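# Illustrative sketch (not executed): with the thin SVD X = U diag(s) V^T,
# the solution computed above is w = V diag(s / (s**2 + alpha)) U^T y, which
# coincides with the normal-equations solution; the names below (X_, y_, a_)
# are made up for the check only.
#
#     import numpy as np
#     rng = np.random.RandomState(0)
#     X_, y_, a_ = rng.randn(6, 4), rng.randn(6), 0.5
#     U_, s_, Vt_ = np.linalg.svd(X_, full_matrices=False)
#     w_svd = Vt_.T.dot((s_ / (s_ ** 2 + a_)) * U_.T.dot(y_))
#     w_ne = np.linalg.solve(X_.T.dot(X_) + a_ * np.eye(4), X_.T.dot(y_))
#     np.allclose(w_svd, w_ne)  # expected: True

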
def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
                     max_iter=None, tol=1e-3, verbose=0, random_state=None,
                     return_n_iter=False, return_intercept=False):
    """Solve the ridge equation by the method of normal equations.

    Read more in the :ref:`User Guide <ridge_regression>`.

    Parameters
    ----------
    X : {array-like, sparse matrix, LinearOperator},
        shape = [n_samples, n_features]
        Training data

    y : array-like, shape = [n_samples] or [n_samples, n_targets]
        Target values

    alpha : {float, array-like},
        shape = [n_targets] if array-like
        Regularization strength; must be a positive float. Regularization
        improves the conditioning of the problem and reduces the variance of
        the estimates. Larger values specify stronger regularization.
        Alpha corresponds to ``C^-1`` in other linear models such as
        LogisticRegression or LinearSVC. If an array is passed, penalties are
        assumed to be specific to the targets. Hence they must correspond in
        number.

    sample_weight : float or numpy array of shape [n_samples]
        Individual weights for each sample. If sample_weight is not None and
        solver='auto', the solver will be set to 'cholesky'.

        .. versionadded:: 0.17

    solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}
        Solver to use in the computational routines:

        - 'auto' chooses the solver automatically based on the type of data.

        - 'svd' uses a Singular Value Decomposition of X to compute the Ridge
          coefficients. More stable for singular matrices than
          'cholesky'.

        - 'cholesky' uses the standard scipy.linalg.solve function to
          obtain a closed-form solution via a Cholesky decomposition of
          dot(X.T, X)

        - 'sparse_cg' uses the conjugate gradient solver as found in
          scipy.sparse.linalg.cg. As an iterative algorithm, this solver is
          more appropriate than 'cholesky' for large-scale data
          (possibility to set `tol` and `max_iter`).

        - 'lsqr' uses the dedicated regularized least-squares routine
          scipy.sparse.linalg.lsqr. It is the fastest but may not be available
          in old scipy versions. It also uses an iterative procedure.

        - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses
          its improved, unbiased version named SAGA. Both methods also use an
          iterative procedure, and are often faster than other solvers when
          both n_samples and n_features are large. Note that 'sag' and
          'saga' fast convergence is only guaranteed on features with
          approximately the same scale. You can preprocess the data with a
          scaler from sklearn.preprocessing.

        All solvers except 'svd' support both dense and sparse data. However,
        only 'sag' and 'saga' support sparse input when ``fit_intercept`` is
        True.

        .. versionadded:: 0.17
           Stochastic Average Gradient descent solver.
        .. versionadded:: 0.19
           SAGA solver.

    max_iter : int, optional
        Maximum number of iterations for conjugate gradient solver.
        For the 'sparse_cg' and 'lsqr' solvers, the default value is
        determined by scipy.sparse.linalg. For the 'sag' and 'saga' solvers,
        the default value is 1000.

    tol : float
        Precision of the solution.

    verbose : int
        Verbosity level. Setting verbose > 0 will display additional
        information depending on the solver used.

    random_state : int, RandomState instance or None, optional, default None
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by `np.random`. Used when ``solver`` == 'sag'.

    return_n_iter : boolean, default False
        If True, the method also returns `n_iter`, the actual number of
        iterations performed by the solver.

        .. versionadded:: 0.17

    return_intercept : boolean, default False
        If True and if X is sparse, the method also returns the intercept,
        and the solver is automatically changed to 'sag'. This is only a
        temporary fix for fitting the intercept with sparse data. For dense
        data, use sklearn.linear_model._preprocess_data before your regression.

        .. versionadded:: 0.17

    Returns
    -------
    coef : array, shape = [n_features] or [n_targets, n_features]
        Weight vector(s).

    n_iter : int, optional
        The actual number of iterations performed by the solver.
        Only returned if `return_n_iter` is True.

    intercept : float or array, shape = [n_targets]
        The intercept of the model. Only returned if `return_intercept`
        is True and if X is a scipy sparse array.

    Notes
    -----
    This function won't compute the intercept.
    """
    if return_intercept and sparse.issparse(X) and solver != 'sag':
        if solver != 'auto':
            warnings.warn("In Ridge, only 'sag' solver can currently fit the "
                          "intercept when X is sparse. Solver has been "
                          "automatically changed into 'sag'.")
        solver = 'sag'

    _dtype = [np.float64, np.float32]

    # SAG needs X and y columns to be C-contiguous and np.float64
    if solver in ['sag', 'saga']:
        X = check_array(X, accept_sparse=['csr'],
                        dtype=np.float64, order='C')
        y = check_array(y, dtype=np.float64, ensure_2d=False, order='F')
    else:
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
                        dtype=_dtype)
        y = check_array(y, dtype=X.dtype, ensure_2d=False)
    check_consistent_length(X, y)

    n_samples, n_features = X.shape

    if y.ndim > 2:
        raise ValueError("Target y has the wrong shape %s" % str(y.shape))

    ravel = False
    if y.ndim == 1:
        y = y.reshape(-1, 1)
        ravel = True

    n_samples_, n_targets = y.shape

    if n_samples != n_samples_:
        raise ValueError("Number of samples in X and y does not correspond:"
                         " %d != %d" % (n_samples, n_samples_))

    has_sw = sample_weight is not None

    if solver == 'auto':
        # cholesky if it's a dense array and cg in any other case
        if not sparse.issparse(X) or has_sw:
            solver = 'cholesky'
        else:
            solver = 'sparse_cg'

    elif solver == 'lsqr' and not hasattr(sp_linalg, 'lsqr'):
        warnings.warn("""lsqr not available on this machine, falling back
                      to sparse_cg.""")
        solver = 'sparse_cg'

    if has_sw:
        if np.atleast_1d(sample_weight).ndim > 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        if solver not in ['sag', 'saga']:
            # SAG supports sample_weight directly. For other solvers,
            # we implement sample_weight via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

    # There should be either 1 or n_targets penalties
    alpha = np.asarray(alpha, dtype=X.dtype).ravel()
    if alpha.size not in [1, n_targets]:
        raise ValueError("Number of targets and number of penalties "
                         "do not correspond: %d != %d"
                         % (alpha.size, n_targets))

    if alpha.size == 1 and n_targets > 1:
        alpha = np.repeat(alpha, n_targets)

    if solver not in ('sparse_cg', 'cholesky', 'svd', 'lsqr', 'sag', 'saga'):
        raise ValueError('Solver %s not understood' % solver)

    n_iter = None
    if solver == 'sparse_cg':
        coef = _solve_sparse_cg(X, y, alpha, max_iter, tol, verbose)

    elif solver == 'lsqr':
        coef, n_iter = _solve_lsqr(X, y, alpha, max_iter, tol)

    elif solver == 'cholesky':
        if n_features > n_samples:
            K = safe_sparse_dot(X, X.T, dense_output=True)
            try:
                dual_coef = _solve_cholesky_kernel(K, y, alpha)

                coef = safe_sparse_dot(X.T, dual_coef, dense_output=True).T
            except linalg.LinAlgError:
                # use SVD solver if matrix is singular
                solver = 'svd'

        else:
            try:
                coef = _solve_cholesky(X, y, alpha)
            except linalg.LinAlgError:
                # use SVD solver if matrix is singular
                solver = 'svd'

    elif solver in ['sag', 'saga']:
        # precompute max_squared_sum for all targets
        max_squared_sum = row_norms(X, squared=True).max()

        coef = np.empty((y.shape[1], n_features))
        n_iter = np.empty(y.shape[1], dtype=np.int32)
        intercept = np.zeros((y.shape[1], ))
        for i, (alpha_i, target) in enumerate(zip(alpha, y.T)):
            init = {'coef': np.zeros((n_features + int(return_intercept), 1))}
            coef_, n_iter_, _ = sag_solver(
                X, target.ravel(), sample_weight, 'squared', alpha_i, 0,
                max_iter, tol, verbose, random_state, False, max_squared_sum,
                init,
                is_saga=solver == 'saga')
            if return_intercept:
                coef[i] = coef_[:-1]
                intercept[i] = coef_[-1]
            else:
                coef[i] = coef_
            n_iter[i] = n_iter_

        if intercept.shape[0] == 1:
            intercept = intercept[0]
        coef = np.asarray(coef)

    if solver == 'svd':
        if sparse.issparse(X):
            raise TypeError('SVD solver does not support sparse'
                            ' inputs currently')
        coef = _solve_svd(X, y, alpha)

    if ravel:
        # When y was passed as a 1d-array, we flatten the coefficients.
        coef = coef.ravel()

    if return_n_iter and return_intercept:
        return coef, n_iter, intercept
    elif return_intercept:
        return coef, intercept
    elif return_n_iter:
        return coef, n_iter
    else:
        return coef

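# Illustrative usage sketch (not executed): ridge_regression can be called
# directly with the interface documented above; the data below is made up.
#
#     import numpy as np
#     from sklearn.linear_model import ridge_regression
#     rng = np.random.RandomState(0)
#     X_, y_ = rng.randn(20, 3), rng.randn(20)
#     coef = ridge_regression(X_, y_, alpha=1.0, solver='cholesky')
#     coef.shape  # expected: (3,); no intercept is computed (see Notes)

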
class _BaseRidge(six.with_metaclass(ABCMeta, LinearModel)):

    @abstractmethod
    def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
                 copy_X=True, max_iter=None, tol=1e-3, solver="auto",
                 random_state=None):
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.copy_X = copy_X
        self.max_iter = max_iter
        self.tol = tol
        self.solver = solver
        self.random_state = random_state

    def fit(self, X, y, sample_weight=None):

        if self.solver in ('sag', 'saga'):
            _dtype = np.float64
        else:
            # all other solvers work at both float precision levels
            _dtype = [np.float64, np.float32]

        X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=_dtype,
                         multi_output=True, y_numeric=True)

        if ((sample_weight is not None) and
                np.atleast_1d(sample_weight).ndim > 1):
            raise ValueError("Sample weights must be 1D array or scalar")

        X, y, X_offset, y_offset, X_scale = self._preprocess_data(
            X, y, self.fit_intercept, self.normalize, self.copy_X,
            sample_weight=sample_weight)

        # temporary fix for fitting the intercept with sparse data using 'sag'
        if sparse.issparse(X) and self.fit_intercept:
            self.coef_, self.n_iter_, self.intercept_ = ridge_regression(
                X, y, alpha=self.alpha, sample_weight=sample_weight,
                max_iter=self.max_iter, tol=self.tol, solver=self.solver,
                random_state=self.random_state, return_n_iter=True,
                return_intercept=True)
            self.intercept_ += y_offset
        else:
            self.coef_, self.n_iter_ = ridge_regression(
                X, y, alpha=self.alpha, sample_weight=sample_weight,
                max_iter=self.max_iter, tol=self.tol, solver=self.solver,
                random_state=self.random_state, return_n_iter=True,
                return_intercept=False)
            self._set_intercept(X_offset, y_offset, X_scale)

        return self

class Ridge(_BaseRidge, RegressorMixin):
    """Linear least squares with l2 regularization.

    This model solves a regression model where the loss function is
    the linear least squares function and regularization is given by
    the l2-norm. Also known as Ridge Regression or Tikhonov regularization.
    This estimator has built-in support for multi-variate regression
    (i.e., when y is a 2d-array of shape [n_samples, n_targets]).

    Read more in the :ref:`User Guide <ridge_regression>`.

    Parameters
    ----------
    alpha : {float, array-like}, shape (n_targets)
        Regularization strength; must be a positive float. Regularization
        improves the conditioning of the problem and reduces the variance of
        the estimates. Larger values specify stronger regularization.
        Alpha corresponds to ``C^-1`` in other linear models such as
        LogisticRegression or LinearSVC. If an array is passed, penalties are
        assumed to be specific to the targets. Hence they must correspond in
        number.

    fit_intercept : boolean
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (e.g. data is expected to be already centered).

    normalize : boolean, optional, default False
        This parameter is ignored when ``fit_intercept`` is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm.
        If you wish to standardize, please use
        :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
        on an estimator with ``normalize=False``.

    copy_X : boolean, optional, default True
        If True, X will be copied; else, it may be overwritten.

    max_iter : int, optional
        Maximum number of iterations for conjugate gradient solver.
        For 'sparse_cg' and 'lsqr' solvers, the default value is determined
        by scipy.sparse.linalg. For the 'sag' and 'saga' solvers, the default
        value is 1000.

    tol : float
        Precision of the solution.

    solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}
        Solver to use in the computational routines:

        - 'auto' chooses the solver automatically based on the type of data.

        - 'svd' uses a Singular Value Decomposition of X to compute the Ridge
          coefficients. More stable for singular matrices than
          'cholesky'.

        - 'cholesky' uses the standard scipy.linalg.solve function to
          obtain a closed-form solution.

        - 'sparse_cg' uses the conjugate gradient solver as found in
          scipy.sparse.linalg.cg. As an iterative algorithm, this solver is
          more appropriate than 'cholesky' for large-scale data
          (possibility to set `tol` and `max_iter`).

        - 'lsqr' uses the dedicated regularized least-squares routine
          scipy.sparse.linalg.lsqr. It is the fastest but may not be available
          in old scipy versions. It also uses an iterative procedure.

        - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses
          its improved, unbiased version named SAGA. Both methods also use an
          iterative procedure, and are often faster than other solvers when
          both n_samples and n_features are large. Note that 'sag' and
          'saga' fast convergence is only guaranteed on features with
          approximately the same scale. You can preprocess the data with a
          scaler from sklearn.preprocessing.

        All solvers except 'svd' support both dense and sparse data. However,
        only 'sag' and 'saga' support sparse input when ``fit_intercept`` is
        True.

        .. versionadded:: 0.17
           Stochastic Average Gradient descent solver.
        .. versionadded:: 0.19
           SAGA solver.

    random_state : int, RandomState instance or None, optional, default None
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by `np.random`. Used when ``solver`` == 'sag'.

        .. versionadded:: 0.17
           *random_state* to support Stochastic Average Gradient.

    Attributes
    ----------
    coef_ : array, shape (n_features,) or (n_targets, n_features)
        Weight vector(s).

    intercept_ : float | array, shape = (n_targets,)
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    n_iter_ : array or None, shape (n_targets,)
        Actual number of iterations for each target. Available only for
        sag and lsqr solvers. Other solvers will return None.

        .. versionadded:: 0.17

    See also
    --------
    RidgeClassifier, RidgeCV, :class:`sklearn.kernel_ridge.KernelRidge`

    Examples
    --------
    >>> from sklearn.linear_model import Ridge
    >>> import numpy as np
    >>> n_samples, n_features = 10, 5
    >>> np.random.seed(0)
    >>> y = np.random.randn(n_samples)
    >>> X = np.random.randn(n_samples, n_features)
    >>> clf = Ridge(alpha=1.0)
    >>> clf.fit(X, y) # doctest: +NORMALIZE_WHITESPACE
    Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
          normalize=False, random_state=None, solver='auto', tol=0.001)

    """
    def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
                 copy_X=True, max_iter=None, tol=1e-3, solver="auto",
                 random_state=None):
        super(Ridge, self).__init__(alpha=alpha, fit_intercept=fit_intercept,
                                    normalize=normalize, copy_X=copy_X,
                                    max_iter=max_iter, tol=tol, solver=solver,
                                    random_state=random_state)

    def fit(self, X, y, sample_weight=None):
        """Fit Ridge regression model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training data

        y : array-like, shape = [n_samples] or [n_samples, n_targets]
            Target values

        sample_weight : float or numpy array of shape [n_samples]
            Individual weights for each sample

        Returns
        -------
        self : returns an instance of self.
        """
        return super(Ridge, self).fit(X, y, sample_weight=sample_weight)

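# Illustrative sketch (not executed): as documented above, a 2-d y can be fit
# with one alpha per target; the data below is made up.
#
#     import numpy as np
#     from sklearn.linear_model import Ridge
#     rng = np.random.RandomState(0)
#     X_, Y_ = rng.randn(10, 3), rng.randn(10, 2)
#     reg = Ridge(alpha=[0.5, 2.0]).fit(X_, Y_)
#     reg.coef_.shape  # expected: (2, 3), one weight vector per target

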
class RidgeClassifier(LinearClassifierMixin, _BaseRidge):
    """Classifier using Ridge regression.

    Read more in the :ref:`User Guide <ridge_regression>`.

    Parameters
    ----------
    alpha : float
        Regularization strength; must be a positive float. Regularization
        improves the conditioning of the problem and reduces the variance of
        the estimates. Larger values specify stronger regularization.
        Alpha corresponds to ``C^-1`` in other linear models such as
        LogisticRegression or LinearSVC.

    fit_intercept : boolean
        Whether to calculate the intercept for this model. If set to false, no
        intercept will be used in calculations (e.g. data is expected to be
        already centered).

    normalize : boolean, optional, default False
        This parameter is ignored when ``fit_intercept`` is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm.
        If you wish to standardize, please use
        :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
        on an estimator with ``normalize=False``.

    copy_X : boolean, optional, default True
        If True, X will be copied; else, it may be overwritten.

    max_iter : int, optional
        Maximum number of iterations for conjugate gradient solver.
        The default value is determined by scipy.sparse.linalg.

    tol : float
        Precision of the solution.

    class_weight : dict or 'balanced', optional
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one.

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``

    solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'}
        Solver to use in the computational routines:

        - 'auto' chooses the solver automatically based on the type of data.

        - 'svd' uses a Singular Value Decomposition of X to compute the Ridge
          coefficients. More stable for singular matrices than
          'cholesky'.

        - 'cholesky' uses the standard scipy.linalg.solve function to
          obtain a closed-form solution.

        - 'sparse_cg' uses the conjugate gradient solver as found in
          scipy.sparse.linalg.cg. As an iterative algorithm, this solver is
          more appropriate than 'cholesky' for large-scale data
          (possibility to set `tol` and `max_iter`).

        - 'lsqr' uses the dedicated regularized least-squares routine
          scipy.sparse.linalg.lsqr. It is the fastest but may not be available
          in old scipy versions. It also uses an iterative procedure.

        - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses
          its unbiased and more flexible version named SAGA. Both methods
          use an iterative procedure, and are often faster than other solvers
          when both n_samples and n_features are large. Note that 'sag' and
          'saga' fast convergence is only guaranteed on features with
          approximately the same scale. You can preprocess the data with a
          scaler from sklearn.preprocessing.

        .. versionadded:: 0.17
           Stochastic Average Gradient descent solver.
        .. versionadded:: 0.19
           SAGA solver.

    random_state : int, RandomState instance or None, optional, default None
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by `np.random`. Used when ``solver`` == 'sag'.

    Attributes
    ----------
    coef_ : array, shape (n_features,) or (n_classes, n_features)
        Weight vector(s).

    intercept_ : float | array, shape = (n_targets,)
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    n_iter_ : array or None, shape (n_targets,)
        Actual number of iterations for each target. Available only for
        sag and lsqr solvers. Other solvers will return None.

    See also
    --------
    Ridge, RidgeClassifierCV

    Notes
    -----
    For multi-class classification, n_class classifiers are trained in
    a one-versus-all approach. Concretely, this is implemented by taking
    advantage of the multi-variate response support in Ridge.
    """
    def __init__(self, alpha=1.0, fit_intercept=True, normalize=False,
                 copy_X=True, max_iter=None, tol=1e-3, class_weight=None,
                 solver="auto", random_state=None):
        super(RidgeClassifier, self).__init__(
            alpha=alpha, fit_intercept=fit_intercept, normalize=normalize,
            copy_X=copy_X, max_iter=max_iter, tol=tol, solver=solver,
            random_state=random_state)
        self.class_weight = class_weight

    def fit(self, X, y, sample_weight=None):
        """Fit Ridge regression model.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training data

        y : array-like, shape = [n_samples]
            Target values

        sample_weight : float or numpy array of shape (n_samples,)
            Sample weight.

            .. versionadded:: 0.17
               *sample_weight* support to Classifier.

        Returns
        -------
        self : returns an instance of self.
        """
        self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        Y = self._label_binarizer.fit_transform(y)
        if not self._label_binarizer.y_type_.startswith('multilabel'):
            y = column_or_1d(y, warn=True)
        else:
            # we don't (yet) support multi-label classification in Ridge
            raise ValueError(
                "%s doesn't support multi-label classification" % (
                    self.__class__.__name__))

        if self.class_weight:
            if sample_weight is None:
                sample_weight = 1.
            # modify the sample weights with the corresponding class weight
            sample_weight = (sample_weight *
                             compute_sample_weight(self.class_weight, y))

        super(RidgeClassifier, self).fit(X, Y, sample_weight=sample_weight)
        return self

    @property
    def classes_(self):
        return self._label_binarizer.classes_

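# Illustrative usage sketch (not executed): RidgeClassifier binarizes the
# labels to {-1, 1} and reuses the regression machinery above, predicting the
# class whose decision value is largest; the data below is made up.
#
#     import numpy as np
#     from sklearn.linear_model import RidgeClassifier
#     X_ = np.array([[0.], [1.], [2.], [3.]])
#     y_ = np.array([0, 0, 1, 1])
#     clf = RidgeClassifier(alpha=1.0).fit(X_, y_)
#     clf.predict(X_)  # expected: array([0, 0, 1, 1])

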
class _RidgeGCV(LinearModel):
    """Ridge regression with built-in Generalized Cross-Validation

    It allows efficient Leave-One-Out cross-validation.

    This class is not intended to be used directly. Use RidgeCV instead.

    Notes
    -----

    We want to solve (K + alpha*Id)c = y,
    where K = X X^T is the kernel matrix.

    Let G = (K + alpha*Id)^-1.

    Dual solution: c = Gy
    Primal solution: w = X^T c

    Compute eigendecomposition K = Q V Q^T.
    Then G = Q (V + alpha*Id)^-1 Q^T,
    where (V + alpha*Id) is diagonal.
    It is thus inexpensive to invert for many alphas.

    Let loov be the vector of prediction values for each example
    when the model was fitted with all examples but this example.

    loov = (KGY - diag(KG)Y) / diag(I-KG)

    Let looe be the vector of prediction errors for each example
    when the model was fitted with all examples but this example.

    looe = y - loov = c / diag(G)

    References
    ----------
    http://cbcl.mit.edu/projects/cbcl/publications/ps/MIT-CSAIL-TR-2007-025.pdf
    http://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf
    """

    def __init__(self, alphas=(0.1, 1.0, 10.0),
                 fit_intercept=True, normalize=False,
                 scoring=None, copy_X=True,
                 gcv_mode=None, store_cv_values=False):
        self.alphas = np.asarray(alphas)
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.scoring = scoring
        self.copy_X = copy_X
        self.gcv_mode = gcv_mode
        self.store_cv_values = store_cv_values

    def _pre_compute(self, X, y, centered_kernel=True):
        # even if X is very sparse, K is usually very dense
        K = safe_sparse_dot(X, X.T, dense_output=True)
        # the following emulates an additional constant regressor
        # corresponding to fit_intercept=True
        # but this is done only when the features have been centered
        if centered_kernel:
            K += np.ones_like(K)
        v, Q = linalg.eigh(K)
        QT_y = np.dot(Q.T, y)
        return v, Q, QT_y

    def _decomp_diag(self, v_prime, Q):
        # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T))
        return (v_prime * Q ** 2).sum(axis=-1)

    def _diag_dot(self, D, B):
        # compute dot(diag(D), B)
        if len(B.shape) > 1:
            # handle case where B is > 1-d
            D = D[(slice(None), ) + (np.newaxis, ) * (len(B.shape) - 1)]
        return D * B

    def _errors_and_values_helper(self, alpha, y, v, Q, QT_y):
        """Helper function to avoid code duplication between self._errors and
        self._values.

        Notes
        -----
        We don't construct matrix G, instead compute action on y & diagonal.
        """
        w = 1. / (v + alpha)
        constant_column = np.var(Q, 0) < 1.e-12
        # detect constant columns
        w[constant_column] = 0  # cancel the regularization for the intercept

        c = np.dot(Q, self._diag_dot(w, QT_y))
        G_diag = self._decomp_diag(w, Q)
        # handle case where y is 2-d
        if len(y.shape) != 1:
            G_diag = G_diag[:, np.newaxis]
        return G_diag, c

    def _errors(self, alpha, y, v, Q, QT_y):
        G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y)
        return (c / G_diag) ** 2, c

    def _values(self, alpha, y, v, Q, QT_y):
        G_diag, c = self._errors_and_values_helper(alpha, y, v, Q, QT_y)
        return y - (c / G_diag), c

    def _pre_compute_svd(self, X, y, centered_kernel=True):
        if sparse.issparse(X):
            raise TypeError("SVD not supported for sparse matrices")
        if centered_kernel:
            X = np.hstack((X, np.ones((X.shape[0], 1))))
        # to emulate fit_intercept=True situation, add a column of ones
        # Note that by centering, the other columns are orthogonal to that one
        U, s, _ = linalg.svd(X, full_matrices=0)
        v = s ** 2
        UT_y = np.dot(U.T, y)
        return v, U, UT_y

    def _errors_and_values_svd_helper(self, alpha, y, v, U, UT_y):
        """Helper function to avoid code duplication between self._errors_svd
        and self._values_svd.
        """
        constant_column = np.var(U, 0) < 1.e-12
        # detect columns colinear to ones
        w = ((v + alpha) ** -1) - (alpha ** -1)
        w[constant_column] = - (alpha ** -1)
        # cancel the regularization for the intercept
        c = np.dot(U, self._diag_dot(w, UT_y)) + (alpha ** -1) * y
        G_diag = self._decomp_diag(w, U) + (alpha ** -1)
        if len(y.shape) != 1:
            # handle case where y is 2-d
            G_diag = G_diag[:, np.newaxis]
        return G_diag, c

    def _errors_svd(self, alpha, y, v, U, UT_y):
        G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y)
        return (c / G_diag) ** 2, c

    def _values_svd(self, alpha, y, v, U, UT_y):
        G_diag, c = self._errors_and_values_svd_helper(alpha, y, v, U, UT_y)
        return y - (c / G_diag), c

    def fit(self, X, y, sample_weight=None):
        """Fit Ridge regression model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training data

        y : array-like, shape = [n_samples] or [n_samples, n_targets]
            Target values. Will be cast to X's dtype if necessary

        sample_weight : float or array-like of shape [n_samples]
            Sample weight

        Returns
        -------
        self : Returns self.
        """
        X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=np.float64,
                         multi_output=True, y_numeric=True)
        if sample_weight is not None and not isinstance(sample_weight, float):
            sample_weight = check_array(sample_weight, ensure_2d=False)
        n_samples, n_features = X.shape

        X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data(
            X, y, self.fit_intercept, self.normalize, self.copy_X,
            sample_weight=sample_weight)

        gcv_mode = self.gcv_mode
        with_sw = len(np.shape(sample_weight))

        if gcv_mode is None or gcv_mode == 'auto':
            if sparse.issparse(X) or n_features > n_samples or with_sw:
                gcv_mode = 'eigen'
            else:
                gcv_mode = 'svd'
        elif gcv_mode == "svd" and with_sw:
            # FIXME non-uniform sample weights not yet supported
            warnings.warn("non-uniform sample weights unsupported for svd, "
                          "forcing usage of eigen")
            gcv_mode = 'eigen'

        if gcv_mode == 'eigen':
            _pre_compute = self._pre_compute
            _errors = self._errors
            _values = self._values
        elif gcv_mode == 'svd':
            # assert n_samples >= n_features
            _pre_compute = self._pre_compute_svd
            _errors = self._errors_svd
            _values = self._values_svd
        else:
            raise ValueError('bad gcv_mode "%s"' % gcv_mode)

        if sample_weight is not None:
            X, y = _rescale_data(X, y, sample_weight)

        centered_kernel = not sparse.issparse(X) and self.fit_intercept

        v, Q, QT_y = _pre_compute(X, y, centered_kernel)
        n_y = 1 if len(y.shape) == 1 else y.shape[1]
        cv_values = np.zeros((n_samples * n_y, len(self.alphas)))
        C = []

        scorer = check_scoring(self, scoring=self.scoring, allow_none=True)
        error = scorer is None

        for i, alpha in enumerate(self.alphas):
            if error:
                out, c = _errors(alpha, y, v, Q, QT_y)
            else:
                out, c = _values(alpha, y, v, Q, QT_y)
            cv_values[:, i] = out.ravel()
            C.append(c)

        if error:
            best = cv_values.mean(axis=0).argmin()
        else:
            # The scorer wants an object that will make the predictions but
            # they are already computed efficiently by _RidgeGCV. This
            # identity_estimator will just return them
            def identity_estimator():
                pass
            identity_estimator.decision_function = lambda y_predict: y_predict
            identity_estimator.predict = lambda y_predict: y_predict

            out = [scorer(identity_estimator, y.ravel(), cv_values[:, i])
                   for i in range(len(self.alphas))]
            best = np.argmax(out)

        self.alpha_ = self.alphas[best]
        self.dual_coef_ = C[best]
        self.coef_ = safe_sparse_dot(self.dual_coef_.T, X)

        self._set_intercept(X_offset, y_offset, X_scale)

        if self.store_cv_values:
            if len(y.shape) == 1:
                cv_values_shape = n_samples, len(self.alphas)
            else:
                cv_values_shape = n_samples, n_y, len(self.alphas)
            self.cv_values_ = cv_values.reshape(cv_values_shape)

        return self

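# Illustrative sketch (not executed): the leave-one-out shortcut exploited by
# _RidgeGCV can be checked against brute force on a tiny dense problem
# (no intercept, fixed alpha). With the hat matrix
# H = X (X^T X + alpha*I)^-1 X^T, the LOO residual for sample i is
# (y_i - yhat_i) / (1 - H_ii); the names below are made up for the check only.
#
#     import numpy as np
#     rng = np.random.RandomState(0)
#     X_, y_, a_ = rng.randn(8, 3), rng.randn(8), 1.0
#     H = X_.dot(np.linalg.solve(X_.T.dot(X_) + a_ * np.eye(3), X_.T))
#     looe_fast = (y_ - H.dot(y_)) / (1 - np.diag(H))
#     looe_brute = np.array([
#         y_[i] - X_[i].dot(np.linalg.solve(
#             np.delete(X_, i, 0).T.dot(np.delete(X_, i, 0)) + a_ * np.eye(3),
#             np.delete(X_, i, 0).T.dot(np.delete(y_, i))))
#         for i in range(8)])
#     np.allclose(looe_fast, looe_brute)  # expected: True

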
class _BaseRidgeCV(LinearModel):
    def __init__(self, alphas=(0.1, 1.0, 10.0),
                 fit_intercept=True, normalize=False, scoring=None,
                 cv=None, gcv_mode=None,
                 store_cv_values=False):
        self.alphas = alphas
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.scoring = scoring
        self.cv = cv
        self.gcv_mode = gcv_mode
        self.store_cv_values = store_cv_values

    def fit(self, X, y, sample_weight=None):
        """Fit Ridge regression model

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data

        y : array-like, shape = [n_samples] or [n_samples, n_targets]
            Target values. Will be cast to X's dtype if necessary

        sample_weight : float or array-like of shape [n_samples]
            Sample weight

        Returns
        -------
        self : Returns self.
        """
        if self.cv is None:
            estimator = _RidgeGCV(self.alphas,
                                  fit_intercept=self.fit_intercept,
                                  normalize=self.normalize,
                                  scoring=self.scoring,
                                  gcv_mode=self.gcv_mode,
                                  store_cv_values=self.store_cv_values)
            estimator.fit(X, y, sample_weight=sample_weight)
            self.alpha_ = estimator.alpha_
            if self.store_cv_values:
                self.cv_values_ = estimator.cv_values_
        else:
            if self.store_cv_values:
                raise ValueError("cv!=None and store_cv_values=True "
                                 "are incompatible")
            parameters = {'alpha': self.alphas}
            gs = GridSearchCV(Ridge(fit_intercept=self.fit_intercept,
                                    normalize=self.normalize),
                              parameters, cv=self.cv, scoring=self.scoring)
            gs.fit(X, y, sample_weight=sample_weight)
            estimator = gs.best_estimator_
            self.alpha_ = gs.best_estimator_.alpha

        self.coef_ = estimator.coef_
        self.intercept_ = estimator.intercept_

        return self

class RidgeCV(_BaseRidgeCV, RegressorMixin):
    """Ridge regression with built-in cross-validation.

    By default, it performs Generalized Cross-Validation, which is a form of
    efficient Leave-One-Out cross-validation.

    Read more in the :ref:`User Guide <ridge_regression>`.

    Parameters
    ----------
    alphas : numpy array of shape [n_alphas]
        Array of alpha values to try.
        Regularization strength; must be a positive float. Regularization
        improves the conditioning of the problem and reduces the variance of
        the estimates. Larger values specify stronger regularization.
        Alpha corresponds to ``C^-1`` in other linear models such as
        LogisticRegression or LinearSVC.

    fit_intercept : boolean
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (e.g. data is expected to be already centered).

    normalize : boolean, optional, default False
        This parameter is ignored when ``fit_intercept`` is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm.
        If you wish to standardize, please use
        :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
        on an estimator with ``normalize=False``.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the efficient Leave-One-Out cross-validation
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`sklearn.model_selection.StratifiedKFold` is used, else,
        :class:`sklearn.model_selection.KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    gcv_mode : {None, 'auto', 'svd', 'eigen'}, optional
        Flag indicating which strategy to use when performing
        Generalized Cross-Validation. Options are::

            'auto' : use 'svd' if n_samples > n_features and X is not sparse,
                     otherwise use 'eigen'
            'svd' : force computation via singular value decomposition of X
                    (does not work for sparse matrices)
            'eigen' : force computation via eigendecomposition of X X^T

        The 'auto' mode is the default and is intended to pick the cheaper
        option of the two depending upon the shape and format of the training
        data.

    store_cv_values : boolean, default=False
        Flag indicating if the cross-validation values corresponding to
        each alpha should be stored in the `cv_values_` attribute (see
        below). This flag is only compatible with `cv=None` (i.e. using
        Generalized Cross-Validation).

    Attributes
    ----------
    cv_values_ : array, shape = [n_samples, n_alphas] or \
        shape = [n_samples, n_targets, n_alphas], optional
        Cross-validation values for each alpha (if `store_cv_values=True` and \
        `cv=None`). After `fit()` has been called, this attribute will \
        contain the mean squared errors (by default) or the values of the \
        `{loss,score}_func` function (if provided in the constructor).

    coef_ : array, shape = [n_features] or [n_targets, n_features]
        Weight vector(s).

    intercept_ : float | array, shape = (n_targets,)
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    alpha_ : float
        Estimated regularization parameter.

    See also
    --------
    Ridge: Ridge regression
    RidgeClassifier: Ridge classifier
    RidgeClassifierCV: Ridge classifier with built-in cross validation
    """
    pass

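# Illustrative usage sketch (not executed): with cv=None, RidgeCV uses the
# efficient generalized (leave-one-out) cross-validation path implemented by
# _RidgeGCV above to pick alpha_; the data below is made up.
#
#     import numpy as np
#     from sklearn.linear_model import RidgeCV
#     rng = np.random.RandomState(0)
#     X_ = rng.randn(50, 4)
#     y_ = X_.dot([1., 2., 0., 0.]) + 0.1 * rng.randn(50)
#     reg = RidgeCV(alphas=(0.1, 1.0, 10.0)).fit(X_, y_)
#     reg.alpha_  # one of the candidate alphas, selected by LOO error

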
class RidgeClassifierCV(LinearClassifierMixin, _BaseRidgeCV):
    """Ridge classifier with built-in cross-validation.

    By default, it performs Generalized Cross-Validation, which is a form of
    efficient Leave-One-Out cross-validation. Currently, only the n_features >
    n_samples case is handled efficiently.

    Read more in the :ref:`User Guide <ridge_regression>`.

    Parameters
    ----------
    alphas : numpy array of shape [n_alphas]
        Array of alpha values to try.
        Regularization strength; must be a positive float. Regularization
        improves the conditioning of the problem and reduces the variance of
        the estimates. Larger values specify stronger regularization.
        Alpha corresponds to ``C^-1`` in other linear models such as
        LogisticRegression or LinearSVC.

    fit_intercept : boolean
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (e.g. data is expected to be already centered).

    normalize : boolean, optional, default False
        This parameter is ignored when ``fit_intercept`` is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm.
        If you wish to standardize, please use
        :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
        on an estimator with ``normalize=False``.

    scoring : string, callable or None, optional, default: None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the efficient Leave-One-Out cross-validation
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    class_weight : dict or 'balanced', optional
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one.

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``

    Attributes
    ----------
    cv_values_ : array, shape = [n_samples, n_alphas] or \
        shape = [n_samples, n_responses, n_alphas], optional
        Cross-validation values for each alpha (if `store_cv_values=True` and
        `cv=None`). After `fit()` has been called, this attribute will contain \
        the mean squared errors (by default) or the values of the \
        `{loss,score}_func` function (if provided in the constructor).

    coef_ : array, shape = [n_features] or [n_targets, n_features]
        Weight vector(s).

    intercept_ : float | array, shape = (n_targets,)
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    alpha_ : float
        Estimated regularization parameter

    See also
    --------
    Ridge: Ridge regression
    RidgeClassifier: Ridge classifier
    RidgeCV: Ridge regression with built-in cross validation

    Notes
    -----
    For multi-class classification, n_class classifiers are trained in
    a one-versus-all approach. Concretely, this is implemented by taking
    advantage of the multi-variate response support in Ridge.
    """
    def __init__(self, alphas=(0.1, 1.0, 10.0), fit_intercept=True,
                 normalize=False, scoring=None, cv=None, class_weight=None):
        super(RidgeClassifierCV, self).__init__(
            alphas=alphas, fit_intercept=fit_intercept, normalize=normalize,
            scoring=scoring, cv=cv)
        self.class_weight = class_weight

    def fit(self, X, y, sample_weight=None):
        """Fit the ridge classifier.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary

        sample_weight : float or numpy array of shape (n_samples,)
            Sample weight.

        Returns
        -------
        self : object
            Returns self.
        """
        self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        Y = self._label_binarizer.fit_transform(y)
        if not self._label_binarizer.y_type_.startswith('multilabel'):
            y = column_or_1d(y, warn=True)

        if self.class_weight:
            if sample_weight is None:
                sample_weight = 1.
            # modify the sample weights with the corresponding class weight
            sample_weight = (sample_weight *
                             compute_sample_weight(self.class_weight, y))

        _BaseRidgeCV.fit(self, X, Y, sample_weight=sample_weight)
        return self

    @property
    def classes_(self):
        return self._label_binarizer.classes_