1378 lines
57 KiB
Python
1378 lines
57 KiB
Python
|
"""
|
||
|
The :mod:`sklearn.model_selection._search` includes utilities to fine-tune the
|
||
|
parameters of an estimator.
|
||
|
"""
|
||
|
from __future__ import print_function
|
||
|
from __future__ import division
|
||
|
|
||
|
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>,
|
||
|
# Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||
|
# Andreas Mueller <amueller@ais.uni-bonn.de>
|
||
|
# Olivier Grisel <olivier.grisel@ensta.org>
|
||
|
# Raghav RV <rvraghav93@gmail.com>
|
||
|
# License: BSD 3 clause
|
||
|
|
||
|
from abc import ABCMeta, abstractmethod
|
||
|
from collections import namedtuple, defaultdict
|
||
|
from functools import partial, reduce
|
||
|
from itertools import product
|
||
|
import operator
|
||
|
import warnings
|
||
|
|
||
|
import numpy as np
|
||
|
from scipy.stats import rankdata
|
||
|
|
||
|
from ..base import BaseEstimator, is_classifier, clone
|
||
|
from ..base import MetaEstimatorMixin
|
||
|
from ._split import check_cv
|
||
|
from ._validation import _fit_and_score
|
||
|
from ._validation import _aggregate_score_dicts
|
||
|
from ..exceptions import NotFittedError
|
||
|
from ..externals.joblib import Parallel, delayed
|
||
|
from ..externals import six
|
||
|
from ..utils import check_random_state
|
||
|
from ..utils.fixes import sp_version
|
||
|
from ..utils.fixes import MaskedArray
|
||
|
from ..utils.fixes import _Mapping as Mapping, _Sequence as Sequence
|
||
|
from ..utils.random import sample_without_replacement
|
||
|
from ..utils.validation import indexable, check_is_fitted
|
||
|
from ..utils.metaestimators import if_delegate_has_method
|
||
|
from ..utils.deprecation import DeprecationDict
|
||
|
from ..metrics.scorer import _check_multimetric_scoring
|
||
|
from ..metrics.scorer import check_scoring
|
||
|
|
||
|
|
||
|
# Names exported by ``from <this module> import *``.
__all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point',
           'ParameterSampler', 'RandomizedSearchCV']
|
||
|
|
||
|
|
||
|
class ParameterGrid(object):
    """Grid of parameters with a discrete number of values for each.

    Can be used to iterate over parameter value combinations with the
    Python built-in function iter.

    Read more in the :ref:`User Guide <search>`.

    Parameters
    ----------
    param_grid : dict of string to sequence, or sequence of such
        The parameter grid to explore, as a dictionary mapping estimator
        parameters to sequences of allowed values.

        An empty dict signifies default parameters.

        A sequence of dicts signifies a sequence of grids to search, and is
        useful to avoid exploring parameter combinations that make no sense
        or have no effect. See the examples below.

    Examples
    --------
    >>> from sklearn.model_selection import ParameterGrid
    >>> param_grid = {'a': [1, 2], 'b': [True, False]}
    >>> list(ParameterGrid(param_grid)) == (
    ...    [{'a': 1, 'b': True}, {'a': 1, 'b': False},
    ...     {'a': 2, 'b': True}, {'a': 2, 'b': False}])
    True

    >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}]
    >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'},
    ...                               {'kernel': 'rbf', 'gamma': 1},
    ...                               {'kernel': 'rbf', 'gamma': 10}]
    True
    >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1}
    True

    See also
    --------
    :class:`GridSearchCV`:
        Uses :class:`ParameterGrid` to perform a full parallelized parameter
        search.
    """

    def __init__(self, param_grid):
        if isinstance(param_grid, Mapping):
            # wrap dictionary in a singleton list to support either dict
            # or list of dicts
            param_grid = [param_grid]
        self.param_grid = param_grid

    def __iter__(self):
        """Iterate over the points in the grid.

        Returns
        -------
        params : iterator over dict of string to any
            Yields dictionaries mapping each estimator parameter to one of its
            allowed values.
        """
        for p in self.param_grid:
            # Always sort the keys of a dictionary, for reproducibility
            items = sorted(p.items())
            if not items:
                # An empty sub-grid stands for "default parameters".
                yield {}
            else:
                keys, values = zip(*items)
                for v in product(*values):
                    params = dict(zip(keys, v))
                    yield params

    def __len__(self):
        """Number of points on the grid."""
        # Multiply with Python integers so that a generator can be consumed
        # directly. The helper is deliberately not called ``product`` to
        # avoid shadowing ``itertools.product`` used in ``__iter__``.
        multiply_all = partial(reduce, operator.mul)
        return sum(multiply_all(len(v) for v in p.values()) if p else 1
                   for p in self.param_grid)

    def __getitem__(self, ind):
        """Get the parameters that would be ``ind``th in iteration

        Parameters
        ----------
        ind : int
            The iteration index

        Returns
        -------
        params : dict of string to any
            Equal to list(self)[ind]

        Raises
        ------
        IndexError
            If ``ind`` is past the last point of the last sub-grid.
        """
        # This is used to make discrete sampling without replacement memory
        # efficient.
        for sub_grid in self.param_grid:
            # XXX: could memoize information used here
            if not sub_grid:
                # An empty sub-grid contributes exactly one point: ``{}``.
                if ind == 0:
                    return {}
                else:
                    ind -= 1
                    continue

            # Reverse so most frequent cycling parameter comes first
            keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
            sizes = [len(v_list) for v_list in values_lists]
            # Same integer product as in ``__len__``; this replaces the
            # ``np.product`` alias, which no longer exists in NumPy >= 2.0.
            total = reduce(operator.mul, sizes, 1)

            if ind >= total:
                # Try the next grid
                ind -= total
            else:
                out = {}
                # Decode ``ind`` as a mixed-radix number whose digits index
                # into each parameter's list of values.
                for key, v_list, n in zip(keys, values_lists, sizes):
                    ind, offset = divmod(ind, n)
                    out[key] = v_list[offset]
                return out

        raise IndexError('ParameterGrid index out of range')
|
||
|
|
||
|
|
||
|
class ParameterSampler(object):
    """Generator on parameters sampled from given distributions.

    Non-deterministic iterable over random candidate combinations for hyper-
    parameter search. When every parameter is given as a list, candidates are
    drawn without replacement. As soon as one parameter is given as a
    distribution, candidates are drawn with replacement.
    It is highly recommended to use continuous distributions for continuous
    parameters.

    Note that before SciPy 0.16, the ``scipy.stats.distributions`` do not
    accept a custom RNG instance and always use the singleton RNG from
    ``numpy.random``. Hence setting ``random_state`` will not guarantee a
    deterministic iteration whenever ``scipy.stats`` distributions are used to
    define the parameter search space. Deterministic behavior is however
    guaranteed from SciPy 0.16 onwards.

    Read more in the :ref:`User Guide <search>`.

    Parameters
    ----------
    param_distributions : dict
        Dictionary where the keys are parameters and values
        are distributions from which a parameter is to be sampled.
        Distributions either have to provide a ``rvs`` function
        to sample from them, or can be given as a list of values,
        where a uniform distribution is assumed.

    n_iter : integer
        Number of parameter settings that are produced.

    random_state : int, RandomState instance or None, optional (default=None)
        Pseudo random number generator state used for random uniform sampling
        from lists of possible values instead of scipy.stats distributions.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    params : dict of string to any
        **Yields** dictionaries mapping each estimator parameter to
        a sampled value.

    Examples
    --------
    >>> from sklearn.model_selection import ParameterSampler
    >>> from scipy.stats.distributions import expon
    >>> import numpy as np
    >>> np.random.seed(0)
    >>> param_grid = {'a':[1, 2], 'b': expon()}
    >>> param_list = list(ParameterSampler(param_grid, n_iter=4))
    >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items())
    ...                 for d in param_list]
    >>> rounded_list == [{'b': 0.89856, 'a': 1},
    ...                  {'b': 0.923223, 'a': 1},
    ...                  {'b': 1.878964, 'a': 2},
    ...                  {'b': 1.038159, 'a': 2}]
    True
    """

    def __init__(self, param_distributions, n_iter, random_state=None):
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.random_state = random_state

    def __iter__(self):
        """Yield ``n_iter`` sampled parameter dictionaries."""
        distributions = self.param_distributions
        # Anything exposing ``rvs`` is treated as a distribution; only when
        # no entry does can we sample the finite grid without replacement.
        only_lists = np.all([not hasattr(dist, "rvs")
                             for dist in distributions.values()])
        rng = check_random_state(self.random_state)

        if not only_lists:
            # Sorted by key so the order of RNG draws is reproducible.
            sorted_items = sorted(distributions.items())
            for _ in six.moves.range(self.n_iter):
                sample = dict()
                for name, dist in sorted_items:
                    if not hasattr(dist, "rvs"):
                        # Plain list: pick one entry uniformly at random.
                        sample[name] = dist[rng.randint(len(dist))]
                    elif sp_version < (0, 16):
                        # Older SciPy cannot take a custom RNG.
                        sample[name] = dist.rvs()
                    else:
                        sample[name] = dist.rvs(random_state=rng)
                yield sample
        else:
            # Draw distinct indices and look them up in the parameter grid.
            full_grid = ParameterGrid(distributions)
            grid_size = len(full_grid)

            if grid_size < self.n_iter:
                raise ValueError(
                    "The total space of parameters %d is smaller "
                    "than n_iter=%d. For exhaustive searches, use "
                    "GridSearchCV." % (grid_size, self.n_iter))

            chosen = sample_without_replacement(grid_size, self.n_iter,
                                                random_state=rng)
            for index in chosen:
                yield full_grid[index]

    def __len__(self):
        """Number of points that will be sampled."""
        return self.n_iter
|
||
|
|
||
|
|
||
|
def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
                   verbose, error_score='raise', **fit_params):
    """Run fit on one set of parameters.

    Parameters
    ----------
    X : array-like, sparse matrix or list
        Input data.

    y : array-like or None
        Targets for input data.

    estimator : estimator object
        A object of that type is instantiated for each grid point.
        This is assumed to implement the scikit-learn estimator interface.
        Either estimator needs to provide a ``score`` function,
        or ``scoring`` must be passed.

    parameters : dict
        Parameters to be set on estimator for this grid point.

    train : ndarray, dtype int or bool
        Boolean mask or indices for training set.

    test : ndarray, dtype int or bool
        Boolean mask or indices for test set.

    scorer : callable or None
        The scorer callable object / function must have its signature as
        ``scorer(estimator, X, y)``.

        If ``None`` the estimator's default scorer is used.

    verbose : int
        Verbosity level.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    **fit_params : kwargs
        Additional parameter passed to the fit function of the estimator.

    Returns
    -------
    score : float
        Score of this parameter setting on given training / test split.

    parameters : dict
        The parameters that have been evaluated.

    n_samples_test : int
        Number of test samples in this split.
    """
    # The scorer is expected to have been validated by the caller already;
    # check_scoring is invoked for its checks only (to reject a multimetric
    # scorer), so its return value is intentionally dropped.
    check_scoring(estimator, scorer)

    fit_outcome = _fit_and_score(
        estimator, X, y, scorer, train, test, verbose, parameters,
        fit_params=fit_params,
        return_n_test_samples=True,
        error_score=error_score)
    score, n_test = fit_outcome
    return score, parameters, n_test
|
||
|
|
||
|
|
||
|
def _check_param_grid(param_grid):
|
||
|
if hasattr(param_grid, 'items'):
|
||
|
param_grid = [param_grid]
|
||
|
|
||
|
for p in param_grid:
|
||
|
for name, v in p.items():
|
||
|
if isinstance(v, np.ndarray) and v.ndim > 1:
|
||
|
raise ValueError("Parameter array should be one-dimensional.")
|
||
|
|
||
|
if (isinstance(v, six.string_types) or
|
||
|
not isinstance(v, (np.ndarray, Sequence))):
|
||
|
raise ValueError("Parameter values for parameter ({0}) need "
|
||
|
"to be a sequence(but not a string) or"
|
||
|
" np.ndarray.".format(name))
|
||
|
|
||
|
if len(v) == 0:
|
||
|
raise ValueError("Parameter values for parameter ({0}) need "
|
||
|
"to be a non-empty sequence.".format(name))
|
||
|
|
||
|
|
||
|
# XXX Remove in 0.20
|
||
|
class _CVScoreTuple (namedtuple('_CVScoreTuple',
|
||
|
('parameters',
|
||
|
'mean_validation_score',
|
||
|
'cv_validation_scores'))):
|
||
|
# A raw namedtuple is very memory efficient as it packs the attributes
|
||
|
# in a struct to get rid of the __dict__ of attributes in particular it
|
||
|
# does not copy the string for the keys on each instance.
|
||
|
# By deriving a namedtuple class just to introduce the __repr__ method we
|
||
|
# would also reintroduce the __dict__ on the instance. By telling the
|
||
|
# Python interpreter that this subclass uses static __slots__ instead of
|
||
|
# dynamic attributes. Furthermore we don't need any additional slot in the
|
||
|
# subclass so we set __slots__ to the empty tuple.
|
||
|
__slots__ = ()
|
||
|
|
||
|
def __repr__(self):
|
||
|
"""Simple custom repr to summarize the main info"""
|
||
|
return "mean: {0:.5f}, std: {1:.5f}, params: {2}".format(
|
||
|
self.mean_validation_score,
|
||
|
np.std(self.cv_validation_scores),
|
||
|
self.parameters)
|
||
|
|
||
|
|
||
|
class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator,
                                      MetaEstimatorMixin)):
    """Base class for hyper parameter search with cross-validation.

    ``fit`` obtains the candidate parameter settings from
    ``self._get_param_iterator()``, which is not defined in this class and
    must be provided by subclasses.
    """

    @abstractmethod
    def __init__(self, estimator, scoring=None,
                 fit_params=None, n_jobs=1, iid=True,
                 refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
                 error_score='raise', return_train_score=True):

        self.scoring = scoring
        self.estimator = estimator
        self.n_jobs = n_jobs
        self.fit_params = fit_params
        self.iid = iid
        self.refit = refit
        self.cv = cv
        self.verbose = verbose
        self.pre_dispatch = pre_dispatch
        self.error_score = error_score
        self.return_train_score = return_train_score

    @property
    def _estimator_type(self):
        # Mirror the type tag of the wrapped estimator.
        return self.estimator._estimator_type

    def score(self, X, y=None):
        """Returns the score on the given data, if the estimator has been refit.

        This uses the score defined by ``scoring`` where provided, and the
        ``best_estimator_.score`` method otherwise.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Input data, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        Returns
        -------
        score : float
        """
        self._check_is_fitted('score')
        if self.scorer_ is None:
            raise ValueError("No score function explicitly defined, "
                             "and the estimator doesn't provide one %s"
                             % self.best_estimator_)
        # With multi-metric scoring ``scorer_`` is a dict and ``refit`` names
        # the metric the best estimator was selected with.
        score = self.scorer_[self.refit] if self.multimetric_ else self.scorer_
        return score(self.best_estimator_, X, y)

    def _check_is_fitted(self, method_name):
        """Raise ``NotFittedError`` unless a refit best estimator exists.

        Parameters
        ----------
        method_name : str
            Name of the calling method, used in the error message.
        """
        if not self.refit:
            # ``best_params_`` is the attribute that ``fit`` actually sets.
            raise NotFittedError('This %s instance was initialized '
                                 'with refit=False. %s is '
                                 'available only after refitting on the best '
                                 'parameters. You can refit an estimator '
                                 'manually using the ``best_params_`` '
                                 'attribute'
                                 % (type(self).__name__, method_name))
        else:
            check_is_fitted(self, 'best_estimator_')

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def predict(self, X):
        """Call predict on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('predict')
        return self.best_estimator_.predict(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def predict_proba(self, X):
        """Call predict_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict_proba``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('predict_proba')
        return self.best_estimator_.predict_proba(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def predict_log_proba(self, X):
        """Call predict_log_proba on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``predict_log_proba``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('predict_log_proba')
        return self.best_estimator_.predict_log_proba(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def decision_function(self, X):
        """Call decision_function on the estimator with the best found parameters.

        Only available if ``refit=True`` and the underlying estimator supports
        ``decision_function``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('decision_function')
        return self.best_estimator_.decision_function(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def transform(self, X):
        """Call transform on the estimator with the best found parameters.

        Only available if the underlying estimator supports ``transform`` and
        ``refit=True``.

        Parameters
        ----------
        X : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('transform')
        return self.best_estimator_.transform(X)

    @if_delegate_has_method(delegate=('best_estimator_', 'estimator'))
    def inverse_transform(self, Xt):
        """Call inverse_transform on the estimator with the best found params.

        Only available if the underlying estimator implements
        ``inverse_transform`` and ``refit=True``.

        Parameters
        ----------
        Xt : indexable, length n_samples
            Must fulfill the input assumptions of the
            underlying estimator.

        """
        self._check_is_fitted('inverse_transform')
        return self.best_estimator_.inverse_transform(Xt)

    @property
    def classes_(self):
        """``classes_`` of the refit best estimator."""
        self._check_is_fitted("classes_")
        return self.best_estimator_.classes_

    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Parameters
        ----------

        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.

        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator

        Returns
        -------
        self : object
            The fitted search instance.
        """
        if self.fit_params is not None:
            warnings.warn('"fit_params" as a constructor argument was '
                          'deprecated in version 0.19 and will be removed '
                          'in version 0.21. Pass fit parameters to the '
                          '"fit" method instead.', DeprecationWarning)
            if fit_params:
                warnings.warn('Ignoring fit_params passed as a constructor '
                              'argument in favor of keyword arguments to '
                              'the "fit" method.', RuntimeWarning)
            else:
                fit_params = self.fit_params
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, six.string_types) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key "
                                 "to refit an estimator with the best "
                                 "parameter setting on the whole data and "
                                 "make the best_* attributes "
                                 "available for that metric. If this is not "
                                 "needed, refit should be set to False "
                                 "explicitly. %r was passed." % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)
        # Regenerate parameter iterable for each fit
        candidate_params = list(self._get_param_iterator())
        n_candidates = len(candidate_params)
        if self.verbose > 0:
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(n_splits, n_candidates,
                                     n_candidates * n_splits))

        base_estimator = clone(self.estimator)
        pre_dispatch = self.pre_dispatch

        # One task per (candidate, split) pair; candidates vary slowest, so
        # results come back grouped by candidate.
        out = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=pre_dispatch
        )(delayed(_fit_and_score)(clone(base_estimator), X, y, scorers, train,
                                  test, self.verbose, parameters,
                                  fit_params=fit_params,
                                  return_train_score=self.return_train_score,
                                  return_n_test_samples=True,
                                  return_times=True, return_parameters=False,
                                  error_score=self.error_score)
          for parameters, (train, test) in product(candidate_params,
                                                   cv.split(X, y, groups)))

        # if one choose to see train score, "out" will contain train score info
        if self.return_train_score:
            (train_score_dicts, test_score_dicts, test_sample_counts, fit_time,
             score_time) = zip(*out)
        else:
            (test_score_dicts, test_sample_counts, fit_time,
             score_time) = zip(*out)

        # test_score_dicts and train_score dicts are lists of dictionaries and
        # we make them into dict of lists
        test_scores = _aggregate_score_dicts(test_score_dicts)
        if self.return_train_score:
            train_scores = _aggregate_score_dicts(train_score_dicts)

        # TODO: replace by a dict in 0.21
        results = (DeprecationDict() if self.return_train_score == 'warn'
                   else {})

        def _store(key_name, array, weights=None, splits=False, rank=False):
            """A small helper to store the scores/times to the cv_results_"""
            # When iterated first by splits, then by parameters
            # We want `array` to have `n_candidates` rows and `n_splits` cols.
            array = np.array(array, dtype=np.float64).reshape(n_candidates,
                                                              n_splits)
            if splits:
                for split_i in range(n_splits):
                    # Uses closure to alter the results
                    results["split%d_%s"
                            % (split_i, key_name)] = array[:, split_i]

            array_means = np.average(array, axis=1, weights=weights)
            results['mean_%s' % key_name] = array_means
            # Weighted std is not directly available in numpy
            array_stds = np.sqrt(np.average((array -
                                             array_means[:, np.newaxis]) ** 2,
                                            axis=1, weights=weights))
            results['std_%s' % key_name] = array_stds

            if rank:
                # Means are negated so that rank 1 is the highest mean score.
                results["rank_%s" % key_name] = np.asarray(
                    rankdata(-array_means, method='min'), dtype=np.int32)

        _store('fit_time', fit_time)
        _store('score_time', score_time)
        # Use one MaskedArray and mask all the places where the param is not
        # applicable for that candidate. Use defaultdict as each candidate may
        # not contain all the params
        param_results = defaultdict(partial(MaskedArray,
                                            np.empty(n_candidates,),
                                            mask=True,
                                            dtype=object))
        for cand_i, params in enumerate(candidate_params):
            for name, value in params.items():
                # An all masked empty array gets created for the key
                # `"param_%s" % name` at the first occurrence of `name`.
                # Setting the value at an index also unmasks that index
                param_results["param_%s" % name][cand_i] = value

        results.update(param_results)
        # Store a list of param dicts at the key 'params'
        results['params'] = candidate_params

        # NOTE test_sample counts (weights) remain the same for all candidates
        # The builtin ``int`` is used because the ``np.int`` alias no longer
        # exists in recent NumPy releases.
        test_sample_counts = np.array(test_sample_counts[:n_splits],
                                      dtype=int)
        for scorer_name in scorers.keys():
            # Computed the (weighted) mean and std for test scores alone
            _store('test_%s' % scorer_name, test_scores[scorer_name],
                   splits=True, rank=True,
                   weights=test_sample_counts if self.iid else None)
            if self.return_train_score:
                prev_keys = set(results.keys())
                _store('train_%s' % scorer_name, train_scores[scorer_name],
                       splits=True)

                if self.return_train_score == 'warn':
                    # Only the keys just added by the train-score store are
                    # flagged.
                    for key in set(results.keys()) - prev_keys:
                        message = (
                            'You are accessing a training score ({!r}), '
                            'which will not be available by default '
                            'any more in 0.21. If you need training scores, '
                            'please set return_train_score=True').format(key)
                        # warn on key access
                        results.add_warning(key, message, FutureWarning)

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
            self.best_params_ = candidate_params[self.best_index_]
            self.best_score_ = results["mean_test_%s" % refit_metric][
                self.best_index_]

        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        return self

    @property
    def grid_scores_(self):
        """Deprecated list of ``_CVScoreTuple`` built from ``cv_results_``.

        Emits a ``DeprecationWarning`` on access and raises
        ``AttributeError`` for multi-metric evaluation.
        """
        check_is_fitted(self, 'cv_results_')
        if self.multimetric_:
            raise AttributeError("grid_scores_ attribute is not available for"
                                 " multi-metric evaluation.")
        warnings.warn(
            "The grid_scores_ attribute was deprecated in version 0.18"
            " in favor of the more elaborate cv_results_ attribute."
            " The grid_scores_ attribute will not be available from 0.20",
            DeprecationWarning)

        grid_scores = list()

        for i, (params, mean, std) in enumerate(zip(
                self.cv_results_['params'],
                self.cv_results_['mean_test_score'],
                self.cv_results_['std_test_score'])):
            # Collect candidate ``i``'s score on every split.
            scores = np.array(list(self.cv_results_['split%d_test_score'
                                                    % s][i]
                                   for s in range(self.n_splits_)),
                              dtype=np.float64)
            grid_scores.append(_CVScoreTuple(params, mean, scores))

        return grid_scores
|
||
|
|
||
|
|
||
|
class GridSearchCV(BaseSearchCV):
    """Exhaustive search over specified parameter values for an estimator.

    Important members are fit, predict.

    GridSearchCV implements a "fit" and a "score" method.
    It also implements "predict", "predict_proba", "decision_function",
    "transform" and "inverse_transform" if they are implemented in the
    estimator used.

    The parameters of the estimator used to apply these methods are optimized
    by cross-validated grid-search over a parameter grid.

    Read more in the :ref:`User Guide <grid_search>`.

    Parameters
    ----------
    estimator : estimator object.
        This is assumed to implement the scikit-learn estimator interface.
        Either estimator needs to provide a ``score`` function,
        or ``scoring`` must be passed.

    param_grid : dict or list of dictionaries
        Dictionary with parameters names (string) as keys and lists of
        parameter settings to try as values, or a list of such
        dictionaries, in which case the grids spanned by each dictionary
        in the list are explored. This enables searching over any sequence
        of parameter settings.

    scoring : string, callable, list/tuple, dict or None, default: None
        A single string (see :ref:`scoring_parameter`) or a callable
        (see :ref:`scoring`) to evaluate the predictions on the test set.

        For evaluating multiple metrics, either give a list of (unique) strings
        or a dict with names as keys and callables as values.

        NOTE that when using custom scorers, each scorer should return a single
        value. Metric functions returning a list/array of values can be wrapped
        into multiple scorers that return one value each.

        See :ref:`multimetric_grid_search` for an example.

        If None, the estimator's default scorer (if available) is used.

    fit_params : dict, optional
        Parameters to pass to the fit method.

        .. deprecated:: 0.19
           ``fit_params`` as a constructor argument was deprecated in version
           0.19 and will be removed in version 0.21. Pass fit parameters to
           the ``fit`` method instead.

    n_jobs : int, default=1
        Number of jobs to run in parallel.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    iid : boolean, default=True
        If True, the data is assumed to be identically distributed across
        the folds, and the loss minimized is the total loss per sample,
        and not the mean loss across the folds.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    refit : boolean, or string, default=True
        Refit an estimator using the best found parameters on the whole
        dataset.

        For multiple metric evaluation, this needs to be a string denoting the
        scorer that would be used to find the best parameters for refitting
        the estimator at the end.

        The refitted estimator is made available at the ``best_estimator_``
        attribute and permits using ``predict`` directly on this
        ``GridSearchCV`` instance.

        Also for multiple metric evaluation, the attributes ``best_index_``,
        ``best_score_`` and ``best_params_`` will only be available if
        ``refit`` is set and all of them will be determined w.r.t this specific
        scorer.

        See ``scoring`` parameter to know more about multiple metric
        evaluation.

    verbose : integer
        Controls the verbosity: the higher, the more messages.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    return_train_score : boolean, optional
        If ``False``, the ``cv_results_`` attribute will not include training
        scores.

        Current default is ``'warn'``, which behaves as ``True`` in addition
        to raising a warning when a training score is looked up.
        That default will be changed to ``False`` in 0.21.
        Computing training scores is used to get insights on how different
        parameter settings impact the overfitting/underfitting trade-off.
        However computing the scores on the training set can be computationally
        expensive and is not strictly required to select the parameters that
        yield the best generalization performance.


    Examples
    --------
    >>> from sklearn import svm, datasets
    >>> from sklearn.model_selection import GridSearchCV
    >>> iris = datasets.load_iris()
    >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
    >>> svc = svm.SVC()
    >>> clf = GridSearchCV(svc, parameters)
    >>> clf.fit(iris.data, iris.target)
    ...                             # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    GridSearchCV(cv=None, error_score=...,
           estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=...,
                         decision_function_shape='ovr', degree=..., gamma=...,
                         kernel='rbf', max_iter=-1, probability=False,
                         random_state=None, shrinking=True, tol=...,
                         verbose=False),
           fit_params=None, iid=..., n_jobs=1,
           param_grid=..., pre_dispatch=..., refit=..., return_train_score=...,
           scoring=..., verbose=...)
    >>> sorted(clf.cv_results_.keys())
    ...                             # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    ['mean_fit_time', 'mean_score_time', 'mean_test_score',...
     'mean_train_score', 'param_C', 'param_kernel', 'params',...
     'rank_test_score', 'split0_test_score',...
     'split0_train_score', 'split1_test_score', 'split1_train_score',...
     'split2_test_score', 'split2_train_score',...
     'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score'...]

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

        For instance the below given table

        +------------+-----------+------------+-----------------+---+---------+
        |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...|
        +============+===========+============+=================+===+=========+
        |  'poly'    |     --    |      2     |       0.8       |...|    2    |
        +------------+-----------+------------+-----------------+---+---------+
        |  'poly'    |     --    |      3     |       0.7       |...|    4    |
        +------------+-----------+------------+-----------------+---+---------+
        |  'rbf'     |    0.1    |     --     |       0.8       |...|    3    |
        +------------+-----------+------------+-----------------+---+---------+
        |  'rbf'     |    0.2    |     --     |       0.9       |...|    1    |
        +------------+-----------+------------+-----------------+---+---------+

        will be represented by a ``cv_results_`` dict of::

            {
            'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
                                         mask = [False False False False]...),
            'param_gamma': masked_array(data = [-- -- 0.1 0.2],
                                        mask = [ True  True False False]...),
            'param_degree': masked_array(data = [2.0 3.0 -- --],
                                         mask = [False False  True  True]...),
            'split0_test_score'  : [0.8, 0.7, 0.8, 0.9],
            'split1_test_score'  : [0.82, 0.5, 0.7, 0.78],
            'mean_test_score'    : [0.81, 0.60, 0.75, 0.82],
            'std_test_score'     : [0.02, 0.01, 0.03, 0.03],
            'rank_test_score'    : [2, 4, 3, 1],
            'split0_train_score' : [0.80, 0.92, 0.70, 0.93],
            'split1_train_score' : [0.82, 0.55, 0.70, 0.87],
            'mean_train_score'   : [0.81, 0.74, 0.70, 0.90],
            'std_train_score'    : [0.01, 0.19, 0.00, 0.03],
            'mean_fit_time'      : [0.73, 0.63, 0.43, 0.49],
            'std_fit_time'       : [0.01, 0.02, 0.01, 0.01],
            'mean_score_time'    : [0.007, 0.06, 0.04, 0.04],
            'std_score_time'     : [0.001, 0.002, 0.003, 0.005],
            'params'             : [{'kernel': 'poly', 'degree': 2}, ...],
            }

        NOTE

        The key ``'params'`` is used to store a list of parameter
        settings dicts for all the parameter candidates.

        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
        ``std_score_time`` are all in seconds.

        For multi-metric evaluation, the scores for all the scorers are
        available in the ``cv_results_`` dict at the keys ending with that
        scorer's name (``'_<scorer_name>'``) instead of ``'_score'`` shown
        above. ('split0_test_precision', 'mean_train_precision' etc.)

    best_estimator_ : estimator or dict
        Estimator that was chosen by the search, i.e. estimator
        which gave highest score (or smallest loss if specified)
        on the left out data. Not available if ``refit=False``.

        See ``refit`` parameter for more information on allowed values.

    best_score_ : float
        Mean cross-validated score of the best_estimator.

        For multi-metric evaluation, this is present only if ``refit`` is
        specified.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.

        For multi-metric evaluation, this is present only if ``refit`` is
        specified.

    best_index_ : int
        The index (of the ``cv_results_`` arrays) which corresponds to the best
        candidate parameter setting.

        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
        the parameter setting for the best model, that gives the highest
        mean score (``search.best_score_``).

        For multi-metric evaluation, this is present only if ``refit`` is
        specified.

    scorer_ : function or a dict
        Scorer function used on the held out data to choose the best
        parameters for the model.

        For multi-metric evaluation, this attribute holds the validated
        ``scoring`` dict which maps the scorer key to the scorer callable.

    n_splits_ : int
        The number of cross-validation splits (folds/iterations).

    Notes
    -----
    The parameters selected are those that maximize the score of the left out
    data, unless an explicit score is passed in which case it is used instead.

    If `n_jobs` was set to a value higher than one, the data is copied for each
    point in the grid (and not `n_jobs` times). This is done for efficiency
    reasons if individual jobs take very little time, but may raise errors if
    the dataset is large and not enough memory is available. A workaround in
    this case is to set `pre_dispatch`. Then, the memory is copied only
    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
    n_jobs`.

    See Also
    --------
    :class:`ParameterGrid`:
        generates all the combinations of a hyperparameter grid.

    :func:`sklearn.model_selection.train_test_split`:
        utility function to split the data into a development set usable
        for fitting a GridSearchCV instance and an evaluation set for
        its final evaluation.

    :func:`sklearn.metrics.make_scorer`:
        Make a scorer from a performance metric or loss function.

    """

    def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
                 n_jobs=1, iid=True, refit=True, cv=None, verbose=0,
                 pre_dispatch='2*n_jobs', error_score='raise',
                 return_train_score="warn"):
        # Everything except ``param_grid`` is forwarded unchanged to the
        # base class constructor.
        super(GridSearchCV, self).__init__(
            estimator=estimator, scoring=scoring, fit_params=fit_params,
            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, error_score=error_score,
            return_train_score=return_train_score)
        self.param_grid = param_grid
        # NOTE(review): _check_param_grid is defined elsewhere in this
        # module and its return value is discarded here; presumably it
        # raises on a malformed grid -- confirm against its definition.
        _check_param_grid(param_grid)

    def _get_param_iterator(self):
        """Return ParameterGrid instance for the given param_grid"""
        return ParameterGrid(self.param_grid)
|
||
|
|
||
|
|
||
|
class RandomizedSearchCV(BaseSearchCV):
    """Randomized search on hyper parameters.

    RandomizedSearchCV implements a "fit" and a "score" method.
    It also implements "predict", "predict_proba", "decision_function",
    "transform" and "inverse_transform" if they are implemented in the
    estimator used.

    The parameters of the estimator used to apply these methods are optimized
    by cross-validated search over parameter settings.

    In contrast to GridSearchCV, not all parameter values are tried out, but
    rather a fixed number of parameter settings is sampled from the specified
    distributions. The number of parameter settings that are tried is
    given by n_iter.

    If all parameters are presented as a list,
    sampling without replacement is performed. If at least one parameter
    is given as a distribution, sampling with replacement is used.
    It is highly recommended to use continuous distributions for continuous
    parameters.

    Read more in the :ref:`User Guide <randomized_parameter_search>`.

    Parameters
    ----------
    estimator : estimator object.
        An object of that type is instantiated for each grid point.
        This is assumed to implement the scikit-learn estimator interface.
        Either estimator needs to provide a ``score`` function,
        or ``scoring`` must be passed.

    param_distributions : dict
        Dictionary with parameters names (string) as keys and distributions
        or lists of parameters to try. Distributions must provide a ``rvs``
        method for sampling (such as those from scipy.stats.distributions).
        If a list is given, it is sampled uniformly.

    n_iter : int, default=10
        Number of parameter settings that are sampled. n_iter trades
        off runtime vs quality of the solution.

    scoring : string, callable, list/tuple, dict or None, default: None
        A single string (see :ref:`scoring_parameter`) or a callable
        (see :ref:`scoring`) to evaluate the predictions on the test set.

        For evaluating multiple metrics, either give a list of (unique) strings
        or a dict with names as keys and callables as values.

        NOTE that when using custom scorers, each scorer should return a single
        value. Metric functions returning a list/array of values can be wrapped
        into multiple scorers that return one value each.

        See :ref:`multimetric_grid_search` for an example.

        If None, the estimator's default scorer (if available) is used.

    fit_params : dict, optional
        Parameters to pass to the fit method.

        .. deprecated:: 0.19
           ``fit_params`` as a constructor argument was deprecated in version
           0.19 and will be removed in version 0.21. Pass fit parameters to
           the ``fit`` method instead.

    n_jobs : int, default=1
        Number of jobs to run in parallel.

    pre_dispatch : int, or string, optional
        Controls the number of jobs that get dispatched during parallel
        execution. Reducing this number can be useful to avoid an
        explosion of memory consumption when more jobs get dispatched
        than CPUs can process. This parameter can be:

            - None, in which case all the jobs are immediately
              created and spawned. Use this for lightweight and
              fast-running jobs, to avoid delays due to on-demand
              spawning of the jobs

            - An int, giving the exact number of total jobs that are
              spawned

            - A string, giving an expression as a function of n_jobs,
              as in '2*n_jobs'

    iid : boolean, default=True
        If True, the data is assumed to be identically distributed across
        the folds, and the loss minimized is the total loss per sample,
        and not the mean loss across the folds.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross validation,
          - integer, to specify the number of folds in a `(Stratified)KFold`,
          - An object to be used as a cross-validation generator.
          - An iterable yielding train, test splits.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass, :class:`StratifiedKFold` is used. In all
        other cases, :class:`KFold` is used.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    refit : boolean, or string, default=True
        Refit an estimator using the best found parameters on the whole
        dataset.

        For multiple metric evaluation, this needs to be a string denoting the
        scorer that would be used to find the best parameters for refitting
        the estimator at the end.

        The refitted estimator is made available at the ``best_estimator_``
        attribute and permits using ``predict`` directly on this
        ``RandomizedSearchCV`` instance.

        Also for multiple metric evaluation, the attributes ``best_index_``,
        ``best_score_`` and ``best_params_`` will only be available if
        ``refit`` is set and all of them will be determined w.r.t this specific
        scorer.

        See ``scoring`` parameter to know more about multiple metric
        evaluation.

    verbose : integer
        Controls the verbosity: the higher, the more messages.

    random_state : int, RandomState instance or None, optional, default=None
        Pseudo random number generator state used for random uniform sampling
        from lists of possible values instead of scipy.stats distributions.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    error_score : 'raise' (default) or numeric
        Value to assign to the score if an error occurs in estimator fitting.
        If set to 'raise', the error is raised. If a numeric value is given,
        FitFailedWarning is raised. This parameter does not affect the refit
        step, which will always raise the error.

    return_train_score : boolean, optional
        If ``False``, the ``cv_results_`` attribute will not include training
        scores.

        Current default is ``'warn'``, which behaves as ``True`` in addition
        to raising a warning when a training score is looked up.
        That default will be changed to ``False`` in 0.21.
        Computing training scores is used to get insights on how different
        parameter settings impact the overfitting/underfitting trade-off.
        However computing the scores on the training set can be computationally
        expensive and is not strictly required to select the parameters that
        yield the best generalization performance.

    Attributes
    ----------
    cv_results_ : dict of numpy (masked) ndarrays
        A dict with keys as column headers and values as columns, that can be
        imported into a pandas ``DataFrame``.

        For instance the below given table

        +--------------+-------------+-------------------+---+---------------+
        | param_kernel | param_gamma | split0_test_score |...|rank_test_score|
        +==============+=============+===================+===+===============+
        |    'rbf'     |     0.1     |        0.8        |...|       2       |
        +--------------+-------------+-------------------+---+---------------+
        |    'rbf'     |     0.2     |        0.9        |...|       1       |
        +--------------+-------------+-------------------+---+---------------+
        |    'rbf'     |     0.3     |        0.7        |...|       1       |
        +--------------+-------------+-------------------+---+---------------+

        will be represented by a ``cv_results_`` dict of::

            {
            'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'],
                                          mask = False),
            'param_gamma'  : masked_array(data = [0.1 0.2 0.3], mask = False),
            'split0_test_score'  : [0.8, 0.9, 0.7],
            'split1_test_score'  : [0.82, 0.5, 0.7],
            'mean_test_score'    : [0.81, 0.7, 0.7],
            'std_test_score'     : [0.02, 0.2, 0.],
            'rank_test_score'    : [3, 1, 1],
            'split0_train_score' : [0.8, 0.9, 0.7],
            'split1_train_score' : [0.82, 0.5, 0.7],
            'mean_train_score'   : [0.81, 0.7, 0.7],
            'std_train_score'    : [0.03, 0.03, 0.04],
            'mean_fit_time'      : [0.73, 0.63, 0.43],
            'std_fit_time'       : [0.01, 0.02, 0.01],
            'mean_score_time'    : [0.007, 0.06, 0.04],
            'std_score_time'     : [0.001, 0.002, 0.003],
            'params'             : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],
            }

        NOTE

        The key ``'params'`` is used to store a list of parameter
        settings dicts for all the parameter candidates.

        The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
        ``std_score_time`` are all in seconds.

        For multi-metric evaluation, the scores for all the scorers are
        available in the ``cv_results_`` dict at the keys ending with that
        scorer's name (``'_<scorer_name>'``) instead of ``'_score'`` shown
        above. ('split0_test_precision', 'mean_train_precision' etc.)

    best_estimator_ : estimator or dict
        Estimator that was chosen by the search, i.e. estimator
        which gave highest score (or smallest loss if specified)
        on the left out data. Not available if ``refit=False``.

        For multi-metric evaluation, this attribute is present only if
        ``refit`` is specified.

        See ``refit`` parameter for more information on allowed values.

    best_score_ : float
        Mean cross-validated score of the best_estimator.

        For multi-metric evaluation, this is not available if ``refit`` is
        ``False``. See ``refit`` parameter for more information.

    best_params_ : dict
        Parameter setting that gave the best results on the hold out data.

        For multi-metric evaluation, this is not available if ``refit`` is
        ``False``. See ``refit`` parameter for more information.

    best_index_ : int
        The index (of the ``cv_results_`` arrays) which corresponds to the best
        candidate parameter setting.

        The dict at ``search.cv_results_['params'][search.best_index_]`` gives
        the parameter setting for the best model, that gives the highest
        mean score (``search.best_score_``).

        For multi-metric evaluation, this is not available if ``refit`` is
        ``False``. See ``refit`` parameter for more information.

    scorer_ : function or a dict
        Scorer function used on the held out data to choose the best
        parameters for the model.

        For multi-metric evaluation, this attribute holds the validated
        ``scoring`` dict which maps the scorer key to the scorer callable.

    n_splits_ : int
        The number of cross-validation splits (folds/iterations).

    Notes
    -----
    The parameters selected are those that maximize the score of the held-out
    data, according to the scoring parameter.

    If `n_jobs` was set to a value higher than one, the data is copied for each
    parameter setting (and not `n_jobs` times). This is done for efficiency
    reasons if individual jobs take very little time, but may raise errors if
    the dataset is large and not enough memory is available. A workaround in
    this case is to set `pre_dispatch`. Then, the memory is copied only
    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
    n_jobs`.

    See Also
    --------
    :class:`GridSearchCV`:
        Does exhaustive search over a grid of parameters.

    :class:`ParameterSampler`:
        A generator over parameter settings, constructed from
        param_distributions.

    """

    def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
                 fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
                 verbose=0, pre_dispatch='2*n_jobs', random_state=None,
                 error_score='raise', return_train_score="warn"):
        # Sampling-specific settings are stored on the instance as given;
        # they are read back by _get_param_iterator.
        self.param_distributions = param_distributions
        self.n_iter = n_iter
        self.random_state = random_state
        # The remaining arguments are forwarded unchanged to the base class
        # constructor.
        super(RandomizedSearchCV, self).__init__(
            estimator=estimator, scoring=scoring, fit_params=fit_params,
            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
            pre_dispatch=pre_dispatch, error_score=error_score,
            return_train_score=return_train_score)

    def _get_param_iterator(self):
        """Return ParameterSampler instance for the given distributions"""
        return ParameterSampler(
            self.param_distributions, self.n_iter,
            random_state=self.random_state)
|