"""Utilities for input validation"""
|
|
|
|
# Authors: Olivier Grisel
#          Gael Varoquaux
#          Andreas Mueller
#          Lars Buitinck
#          Alexandre Gramfort
#          Nicolas Tresegnie
# License: BSD 3 clause
|
|
|
|
import warnings
|
|
import numbers
|
|
|
|
import numpy as np
|
|
import scipy.sparse as sp
|
|
|
|
from ..externals import six
|
|
from ..utils.fixes import signature
|
|
from .. import get_config as _get_config
|
|
from ..exceptions import NonBLASDotWarning
|
|
from ..exceptions import NotFittedError
|
|
from ..exceptions import DataConversionWarning
|
|
from ..externals.joblib import Memory
|
|
|
|
|
|
# NumPy floating point dtypes, listed from widest to narrowest.  Not
# referenced by the functions visible in this part of the module; presumably
# imported by callers elsewhere -- TODO confirm.
FLOAT_DTYPES = (np.float64, np.float32, np.float16)

# Silenced by default to reduce verbosity. Turn on at runtime for
# performance profiling.
# NOTE: this installs a process-wide warning filter as an import side effect.
warnings.simplefilter('ignore', NonBLASDotWarning)
|
|
|
|
|
|
def _assert_all_finite(X):
    """Raise a ValueError if an ndarray-like holds NaN or infinite values.

    The check is skipped entirely when the global ``assume_finite``
    configuration flag is truthy, and inputs whose dtype is not one of the
    ``np.typecodes['AllFloat']`` typecodes are always accepted.

    Parameters
    ----------
    X : array-like
        Converted with ``np.asanyarray`` before being inspected.

    Raises
    ------
    ValueError
        If a floating point input contains a non-finite entry.
    """
    if _get_config()['assume_finite']:
        return

    values = np.asanyarray(X)
    if values.dtype.char not in np.typecodes['AllFloat']:
        return

    # Cheap path first: a finite sum (O(n) time, O(1) extra space) means
    # every entry is finite.
    if np.isfinite(values.sum()):
        return

    # The sum itself may have overflowed even though every entry is finite,
    # so confirm element-wise (O(n) extra space) before rejecting the input.
    if np.isfinite(values).all():
        return

    raise ValueError("Input contains NaN, infinity"
                     " or a value too large for %r." % values.dtype)
|
|
|
|
|
|
def assert_all_finite(X):
    """Raise a ValueError when X holds NaN or infinite values.

    Parameters
    ----------
    X : array or sparse matrix
        For a sparse matrix only the explicitly stored entries (``X.data``)
        are inspected; any other input is checked as a whole.
    """
    if sp.issparse(X):
        payload = X.data
    else:
        payload = X
    _assert_all_finite(payload)
|
|
|
|
|
|
def as_float_array(X, copy=True, force_all_finite=True):
    """Convert an array-like into an array of floats.

    The result is np.float32 or np.float64 depending on the input dtype.
    Whether the input itself may be returned is controlled by ``copy``.

    Parameters
    ----------
    X : {array-like, sparse matrix}

    copy : bool, optional
        If True, a copy of X is made. If False, a copy may still be returned
        when the dtype of X is not float32/float64 and has to be converted.

    force_all_finite : boolean (default=True)
        Whether to raise an error on np.inf and np.nan in X. This flag is
        only forwarded to ``check_array``, i.e. it is only consulted for
        ``np.matrix`` inputs and inputs that are neither ndarrays nor sparse
        matrices.

    Returns
    -------
    XT : {array, sparse matrix}
        An array of type np.float
    """
    is_sparse = sp.issparse(X)

    # np.matrix instances and generic array-likes (lists, ...) are handed
    # over to the full validation routine.
    if isinstance(X, np.matrix) or not (isinstance(X, np.ndarray)
                                        or is_sparse):
        return check_array(X, ['csr', 'csc', 'coo'], dtype=np.float64,
                           copy=copy, force_all_finite=force_all_finite,
                           ensure_2d=False)

    if X.dtype in [np.float32, np.float64]:
        # already a supported float type: hand back X or a faithful copy
        if not copy:
            return X
        if is_sparse:
            return X.copy()
        # keep the memory layout of the dense input
        layout = 'F' if X.flags['F_CONTIGUOUS'] else 'C'
        return X.copy(layout)

    # Unsigned / signed integers and booleans of at most 4 bytes are
    # converted to float32; everything else becomes float64.
    small_integral = X.dtype.kind in 'uib' and X.dtype.itemsize <= 4
    return X.astype(np.float32 if small_integral else np.float64)
|
|
|
|
|
|
def _is_arraylike(x):
|
|
"""Returns whether the input is array-like"""
|
|
return (hasattr(x, '__len__') or
|
|
hasattr(x, 'shape') or
|
|
hasattr(x, '__array__'))
|
|
|
|
|
|
def _num_samples(x):
|
|
"""Return number of samples in array-like x."""
|
|
if hasattr(x, 'fit') and callable(x.fit):
|
|
# Don't get num_samples from an ensembles length!
|
|
raise TypeError('Expected sequence or array-like, got '
|
|
'estimator %s' % x)
|
|
if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
|
|
if hasattr(x, '__array__'):
|
|
x = np.asarray(x)
|
|
else:
|
|
raise TypeError("Expected sequence or array-like, got %s" %
|
|
type(x))
|
|
if hasattr(x, 'shape'):
|
|
if len(x.shape) == 0:
|
|
raise TypeError("Singleton array %r cannot be considered"
|
|
" a valid collection." % x)
|
|
return x.shape[0]
|
|
else:
|
|
return len(x)
|
|
|
|
|
|
def _shape_repr(shape):
|
|
"""Return a platform independent representation of an array shape
|
|
|
|
Under Python 2, the `long` type introduces an 'L' suffix when using the
|
|
default %r format for tuples of integers (typically used to store the shape
|
|
of an array).
|
|
|
|
Under Windows 64 bit (and Python 2), the `long` type is used by default
|
|
in numpy shapes even when the integer dimensions are well below 32 bit.
|
|
The platform specific type causes string messages or doctests to change
|
|
from one platform to another which is not desirable.
|
|
|
|
Under Python 3, there is no more `long` type so the `L` suffix is never
|
|
introduced in string representation.
|
|
|
|
>>> _shape_repr((1, 2))
|
|
'(1, 2)'
|
|
>>> one = 2 ** 64 / 2 ** 64 # force an upcast to `long` under Python 2
|
|
>>> _shape_repr((one, 2 * one))
|
|
'(1, 2)'
|
|
>>> _shape_repr((1,))
|
|
'(1,)'
|
|
>>> _shape_repr(())
|
|
'()'
|
|
"""
|
|
if len(shape) == 0:
|
|
return "()"
|
|
joined = ", ".join("%d" % e for e in shape)
|
|
if len(shape) == 1:
|
|
# special notation for singleton tuples
|
|
joined += ','
|
|
return "(%s)" % joined
|
|
|
|
|
|
def check_memory(memory):
    """Validate ``memory`` and return a joblib.Memory-like object.

    ``None`` and strings are turned into a
    ``Memory(cachedir=memory, verbose=0)`` instance; any object exposing a
    ``cache`` attribute is returned untouched.

    Parameters
    ----------
    memory : None, str or object with the joblib.Memory interface

    Returns
    -------
    memory : object with the joblib.Memory interface

    Raises
    ------
    ValueError
        If ``memory`` is not joblib.Memory-like.
    """
    if memory is None or isinstance(memory, six.string_types):
        return Memory(cachedir=memory, verbose=0)

    if hasattr(memory, 'cache'):
        return memory

    raise ValueError("'memory' should be None, a string or have the same"
                     " interface as sklearn.externals.joblib.Memory."
                     " Got memory='{}' instead.".format(memory))
|
|
|
|
|
|
def check_consistent_length(*arrays):
    """Check that all arrays share the same number of samples.

    The number of samples of each input is obtained with ``_num_samples``;
    ``None`` entries are ignored.

    Parameters
    ----------
    *arrays : list or tuple of input objects.
        Objects that will be checked for consistent length.

    Raises
    ------
    ValueError
        If at least two inputs disagree on their number of samples.
    """
    lengths = []
    for candidate in arrays:
        if candidate is not None:
            lengths.append(_num_samples(candidate))

    distinct = np.unique(lengths)
    if len(distinct) > 1:
        raise ValueError("Found input variables with inconsistent numbers of"
                         " samples: %r" % [int(count) for count in lengths])
|
|
|
|
|
|
def indexable(*iterables):
    """Make arrays indexable for cross-validation.

    Sparse matrices are converted to CSR, ``None`` and objects that already
    offer ``__getitem__`` or ``iloc`` are passed through, and anything else
    is wrapped with ``np.array``. The resulting objects are then checked for
    consistent length.

    Parameters
    ----------
    *iterables : lists, dataframes, arrays, sparse matrices
        List of objects to ensure sliceability.

    Returns
    -------
    result : list
        The converted objects, in the order they were given.
    """
    def _as_indexable(obj):
        # convert one input according to the rules listed in the docstring
        if sp.issparse(obj):
            return obj.tocsr()
        if hasattr(obj, "__getitem__") or hasattr(obj, "iloc") or obj is None:
            return obj
        return np.array(obj)

    result = [_as_indexable(obj) for obj in iterables]
    check_consistent_length(*result)
    return result
|
|
|
|
|
|
def _ensure_sparse_format(spmatrix, accept_sparse, dtype, copy,
                          force_all_finite):
    """Convert a sparse matrix to an accepted format and dtype.

    Parameters
    ----------
    spmatrix : scipy sparse matrix
        Input to validate and convert.

    accept_sparse : string, boolean or list/tuple of strings
        String[s] naming the allowed sparse formats ('csc', 'csr', 'coo',
        'dok', 'bsr', 'lil', 'dia'). An input in another format is converted
        to the first listed one. True accepts any format, False rejects
        sparse input altogether.

    dtype : string, type or None
        Data type of result. If None, the dtype of the input is preserved.

    copy : boolean
        Whether a forced copy will be triggered. If copy=False, a copy might
        still be triggered by a conversion.

    force_all_finite : boolean
        Whether to raise an error on np.inf and np.nan in X.

    Returns
    -------
    spmatrix_converted : scipy sparse matrix.
        Matrix that is ensured to have an allowed type.

    Raises
    ------
    TypeError
        If ``accept_sparse`` is False.
    ValueError
        If ``accept_sparse`` is an empty list/tuple or of an unsupported
        type.
    """
    # the fallback dtype is the one of the matrix as it was passed in
    target_dtype = spmatrix.dtype if dtype is None else dtype

    if isinstance(accept_sparse, six.string_types):
        accept_sparse = [accept_sparse]

    format_converted = False
    if accept_sparse is False:
        raise TypeError('A sparse matrix was passed, but dense '
                        'data is required. Use X.toarray() to '
                        'convert to a dense numpy array.')
    if isinstance(accept_sparse, (list, tuple)):
        if len(accept_sparse) == 0:
            raise ValueError("When providing 'accept_sparse' "
                             "as a tuple or list, it must contain at "
                             "least one string value.")
        if spmatrix.format not in accept_sparse:
            # switch to the preferred (first listed) format
            spmatrix = spmatrix.asformat(accept_sparse[0])
            format_converted = True
    elif accept_sparse is not True:
        # neither a string, a boolean nor a list/tuple
        raise ValueError("Parameter 'accept_sparse' should be a string, "
                         "boolean or list of strings. You provided "
                         "'accept_sparse={}'.".format(accept_sparse))

    if target_dtype != spmatrix.dtype:
        spmatrix = spmatrix.astype(target_dtype)
    elif copy and not format_converted:
        # no conversion took place so far: honour the explicit copy request
        spmatrix = spmatrix.copy()

    if force_all_finite:
        if hasattr(spmatrix, "data"):
            _assert_all_finite(spmatrix.data)
        else:
            warnings.warn("Can't check %s sparse matrix for nan or inf."
                          % spmatrix.format)
    return spmatrix
|
|
|
|
|
|
def check_array(array, accept_sparse=False, dtype="numeric", order=None,
                copy=False, force_all_finite=True, ensure_2d=True,
                allow_nd=False, ensure_min_samples=1, ensure_min_features=1,
                warn_on_dtype=False, estimator=None):
    """Input validation on an array, list, sparse matrix or similar.

    By default, the input is converted to an at least 2D numpy array.
    If the dtype of the array is object, attempt converting to float,
    raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : string, boolean or list/tuple of strings (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

        .. deprecated:: 0.19
           Passing 'None' to parameter ``accept_sparse`` in methods is
           deprecated in version 0.19 and will be removed in 0.21. Use
           ``accept_sparse=False`` instead.

    dtype : string, type, list of types or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : 'F', 'C' or None (default=None)
        Whether an array will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output array; otherwise (copy=True)
        the memory layout of the returned array is kept as close as possible
        to the original array.

    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : boolean (default=True)
        Whether to raise an error on np.inf and np.nan in X.

    ensure_2d : boolean (default=True)
        Whether to raise a value error if X is not 2d.

    allow_nd : boolean (default=False)
        Whether to allow X.ndim > 2.

    ensure_min_samples : int (default=1)
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.

    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.

    warn_on_dtype : boolean (default=False)
        Raise DataConversionWarning if the dtype of the input data structure
        does not match the requested dtype, causing a memory copy.

    estimator : str or estimator instance (default=None)
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    X_converted : object
        The converted and validated X.

    Raises
    ------
    ValueError
        If a dense input is 1D while ``ensure_2d`` is True, has 3 or more
        dimensions while ``allow_nd`` is False, or has fewer samples /
        features than required.
    """
    # accept_sparse 'None' deprecation check
    if accept_sparse is None:
        warnings.warn(
            "Passing 'None' to parameter 'accept_sparse' in methods "
            "check_array and check_X_y is deprecated in version 0.19 "
            "and will be removed in 0.21. Use 'accept_sparse=False' "
            " instead.", DeprecationWarning)
        accept_sparse = False

    # store whether originally we wanted numeric dtype
    dtype_numeric = isinstance(dtype, six.string_types) and dtype == "numeric"

    dtype_orig = getattr(array, "dtype", None)
    if not hasattr(dtype_orig, 'kind'):
        # not a data type (e.g. a column named dtype in a pandas DataFrame)
        dtype_orig = None

    if dtype_numeric:
        if dtype_orig is not None and dtype_orig.kind == "O":
            # if input is object, convert to float.
            dtype = np.float64
        else:
            dtype = None

    if isinstance(dtype, (list, tuple)):
        if dtype_orig is not None and dtype_orig in dtype:
            # no dtype conversion required
            dtype = None
        else:
            # dtype conversion required. Let's select the first element of the
            # list of accepted types.
            dtype = dtype[0]

    if estimator is not None:
        if isinstance(estimator, six.string_types):
            estimator_name = estimator
        else:
            estimator_name = estimator.__class__.__name__
    else:
        estimator_name = "Estimator"
    context = " by %s" % estimator_name if estimator is not None else ""

    if sp.issparse(array):
        array = _ensure_sparse_format(array, accept_sparse, dtype, copy,
                                      force_all_finite)
    else:
        if copy:
            array = np.array(array, dtype=dtype, order=order, copy=True)
        else:
            # ``np.array(..., copy=False)`` raises on NumPy >= 2 as soon as a
            # copy cannot be avoided (list input, dtype or order change).
            # ``np.asarray`` copies only when a conversion requires it on
            # every NumPy version, which is the documented contract.
            array = np.asarray(array, dtype=dtype, order=order)

        if ensure_2d:
            if array.ndim == 1:
                raise ValueError(
                    "Expected 2D array, got 1D array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array))
            array = np.atleast_2d(array)
            # To ensure that array flags are maintained. A requested copy
            # was already made above, so the data is not duplicated again.
            array = np.asarray(array, dtype=dtype, order=order)

        # make sure we actually converted to numeric:
        if dtype_numeric and array.dtype.kind == "O":
            array = array.astype(np.float64)
        if not allow_nd and array.ndim >= 3:
            raise ValueError("Found array with dim %d. %s expected <= 2."
                             % (array.ndim, estimator_name))
        if force_all_finite:
            _assert_all_finite(array)

    shape_repr = _shape_repr(array.shape)
    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError("Found array with %d sample(s) (shape=%s) while a"
                             " minimum of %d is required%s."
                             % (n_samples, shape_repr, ensure_min_samples,
                                context))

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError("Found array with %d feature(s) (shape=%s) while"
                             " a minimum of %d is required%s."
                             % (n_features, shape_repr, ensure_min_features,
                                context))

    if warn_on_dtype and dtype_orig is not None and array.dtype != dtype_orig:
        msg = ("Data with input dtype %s was converted to %s%s."
               % (dtype_orig, array.dtype, context))
        warnings.warn(msg, DataConversionWarning)
    return array
|
|
|
|
|
|
def check_X_y(X, y, accept_sparse=False, dtype="numeric", order=None,
              copy=False, force_all_finite=True, ensure_2d=True,
              allow_nd=False, multi_output=False, ensure_min_samples=1,
              ensure_min_features=1, y_numeric=False,
              warn_on_dtype=False, estimator=None):
    """Input validation for standard estimators.

    X is validated with ``check_array`` using the options given here. y is
    either validated as a column / 1d vector or, when ``multi_output`` is
    True, with ``check_array`` (2d and CSR sparse allowed). In both cases y
    is checked for np.nan / np.inf. Finally X and y are checked for
    consistent length.

    Parameters
    ----------
    X : nd-array, list or sparse matrix
        Input data.

    y : nd-array, list or sparse matrix
        Labels.

    accept_sparse : string, boolean or list of string (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

        .. deprecated:: 0.19
           Passing 'None' to parameter ``accept_sparse`` in methods is
           deprecated in version 0.19 and will be removed in 0.21. Use
           ``accept_sparse=False`` instead.

    dtype : string, type, list of types or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : 'F', 'C' or None (default=None)
        Whether an array will be forced to be fortran or c-style.

    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : boolean (default=True)
        Whether to raise an error on np.inf and np.nan in X. This parameter
        does not influence whether y can have np.inf or np.nan values.

    ensure_2d : boolean (default=True)
        Forwarded to ``check_array`` for the validation of X.

    allow_nd : boolean (default=False)
        Whether to allow X.ndim > 2.

    multi_output : boolean (default=False)
        Whether to allow 2-d y (array or sparse matrix). If false, y will be
        validated as a vector. y cannot have np.nan or np.inf values if
        multi_output=True.

    ensure_min_samples : int (default=1)
        Make sure that X has a minimum number of samples in its first
        axis (rows for a 2D array).

    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when X has effectively 2 dimensions or
        is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
        this check.

    y_numeric : boolean (default=False)
        Whether to ensure that y has a numeric type. If dtype of y is object,
        it is converted to float64. Should only be used for regression
        algorithms.

    warn_on_dtype : boolean (default=False)
        Raise DataConversionWarning if the dtype of the input data structure
        does not match the requested dtype, causing a memory copy.

    estimator : str or estimator instance (default=None)
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    X_converted : object
        The converted and validated X.

    y_converted : object
        The converted and validated y.
    """
    X = check_array(X, accept_sparse=accept_sparse, dtype=dtype, order=order,
                    copy=copy, force_all_finite=force_all_finite,
                    ensure_2d=ensure_2d, allow_nd=allow_nd,
                    ensure_min_samples=ensure_min_samples,
                    ensure_min_features=ensure_min_features,
                    warn_on_dtype=warn_on_dtype, estimator=estimator)

    if not multi_output:
        # single output: flatten y and reject non-finite targets
        y = column_or_1d(y, warn=True)
        _assert_all_finite(y)
    else:
        y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,
                        dtype=None)

    needs_float_cast = y_numeric and y.dtype.kind == 'O'
    if needs_float_cast:
        y = y.astype(np.float64)

    check_consistent_length(X, y)

    return X, y
|
|
|
|
|
|
def column_or_1d(y, warn=False):
    """Flatten a 1d array or a single-column 2d array into a 1d array.

    Parameters
    ----------
    y : array-like

    warn : boolean, default False
        Whether to emit a DataConversionWarning when a column vector
        (shape ``(n, 1)``) is flattened.

    Returns
    -------
    y : array
        The raveled input.

    Raises
    ------
    ValueError
        If y is neither 1d nor a 2d array with exactly one column.
    """
    shape = np.shape(y)
    n_dims = len(shape)
    is_column = n_dims == 2 and shape[1] == 1

    if n_dims != 1 and not is_column:
        raise ValueError("bad input shape {0}".format(shape))

    if is_column and warn:
        warnings.warn("A column-vector y was passed when a 1d array was"
                      " expected. Please change the shape of y to "
                      "(n_samples, ), for example using ravel().",
                      DataConversionWarning, stacklevel=2)
    return np.ravel(y)
|
|
|
|
|
|
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance

    Parameters
    ----------
    seed : None | int | instance of RandomState
        If seed is None (or the ``np.random`` module itself), return the
        RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.

    Returns
    -------
    random_state : np.random.RandomState

    Raises
    ------
    ValueError
        If seed is none of the supported kinds.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, (numbers.Integral, np.integer)):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    # Wrap the seed in a 1-tuple: a bare tuple seed would otherwise be
    # unpacked by the ``%`` operator and raise TypeError instead of the
    # documented ValueError.
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % (seed,))
|
|
|
|
|
|
def has_fit_parameter(estimator, parameter):
    """Tell whether the ``fit`` method of an estimator accepts a parameter.

    Parameters
    ----------
    estimator : object
        An estimator to inspect.

    parameter: str
        The searched parameter.

    Returns
    -------
    is_parameter: bool
        Whether the parameter was found to be a named parameter of the
        estimator's fit method.

    Examples
    --------
    >>> from sklearn.svm import SVC
    >>> has_fit_parameter(SVC(), "sample_weight")
    True

    """
    fit_parameters = signature(estimator.fit).parameters
    return parameter in fit_parameters
|
|
|
|
|
|
def check_symmetric(array, tol=1E-10, raise_warning=True,
                    raise_exception=False):
    """Make sure that array is 2D, square and symmetric.

    A symmetric input is returned unchanged. Otherwise the average of the
    array and its transpose is returned, optionally after emitting a warning
    or raising an exception.

    Parameters
    ----------
    array : nd-array or sparse matrix
        Input object to check / convert. Must be two-dimensional and square,
        otherwise a ValueError will be raised.
    tol : float
        Tolerance used when comparing the array with its transpose.
        Default = 1E-10. Dense inputs are compared with
        ``np.allclose(..., atol=tol)``; for sparse inputs the stored entries
        of ``array - array.T`` must be smaller than tol in absolute value.
    raise_warning : boolean (default=True)
        If True then raise a warning if conversion is required.
    raise_exception : boolean (default=False)
        If True then raise an exception if array is not symmetric.

    Returns
    -------
    array_sym : ndarray or sparse matrix
        Symmetrized version of the input array, i.e. the average of array
        and array.transpose(). A sparse result is converted back to the
        format of the input.
    """
    n_rows_or_not_2d = (array.ndim != 2) or (array.shape[0] != array.shape[1])
    if n_rows_or_not_2d:
        raise ValueError("array must be 2-dimensional and square. "
                         "shape = {0}".format(array.shape))

    is_sparse = sp.issparse(array)
    if is_sparse:
        delta = array - array.T
        # only csr, csc, and coo have `data` attribute
        if delta.format not in ('csr', 'csc', 'coo'):
            delta = delta.tocsr()
        symmetric = np.all(abs(delta.data) < tol)
    else:
        symmetric = np.allclose(array, array.T, atol=tol)

    if symmetric:
        return array

    if raise_exception:
        raise ValueError("Array must be symmetric")
    if raise_warning:
        warnings.warn("Array is not symmetric, and will be converted "
                      "to symmetric by average with its transpose.")

    averaged = 0.5 * (array + array.T)
    if is_sparse:
        # go back to the sparse format of the input, e.g. ``tocsr``
        return getattr(averaged, 'to' + array.format)()
    return averaged
|
|
|
|
|
|
def check_is_fitted(estimator, attributes, msg=None, all_or_any=all):
    """Perform is_fitted validation for estimator.

    Checks if the estimator is fitted by verifying the presence of
    "all_or_any" of the passed attributes and raises a NotFittedError with the
    given message.

    Parameters
    ----------
    estimator : estimator instance.
        estimator instance for which the check is performed.

    attributes : attribute name(s) given as string or a list/tuple of strings
        Eg.:
            ``["coef_", "estimator_", ...], "coef_"``

    msg : string
        The default error message is, "This %(name)s instance is not fitted
        yet. Call 'fit' with appropriate arguments before using this method."

        For custom messages if "%(name)s" is present in the message string,
        it is substituted for the estimator name.

        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".

    all_or_any : callable, {all, any}, default all
        Specify whether all or any of the given attributes must exist.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``estimator`` has no ``fit`` attribute.
    NotFittedError
        If the attributes are not found.
    """
    if msg is None:
        msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
               "appropriate arguments before using this method.")

    if not hasattr(estimator, 'fit'):
        # 1-tuple on purpose: a tuple passed by mistake must be formatted as
        # a whole rather than unpacked by the ``%`` operator.
        raise TypeError("%s is not an estimator instance." % (estimator,))

    if not isinstance(attributes, (list, tuple)):
        attributes = [attributes]

    if not all_or_any([hasattr(estimator, attr) for attr in attributes]):
        raise NotFittedError(msg % {'name': type(estimator).__name__})
|
|
|
|
|
|
def check_non_negative(X, whom):
    """
    Check if there is any negative value in an array.

    Parameters
    ----------
    X : array-like or sparse matrix
        Input data. For sparse input only the explicitly stored entries are
        inspected.

    whom : string
        Who passed X to this function.

    Raises
    ------
    ValueError
        If X contains at least one negative value.
    """
    if sp.issparse(X):
        if X.format in ('lil', 'dok'):
            # These formats do not expose a flat numeric ``data`` array
            # (lil stores per-row lists), so compare on a CSR conversion.
            X = X.tocsr()
        values = X.data
    else:
        # Accept plain sequences (e.g. lists) as well; asanyarray keeps
        # ndarray subclasses such as masked arrays untouched.
        values = np.asanyarray(X)

    if (values < 0).any():
        raise ValueError("Negative values in data passed to %s" % whom)
|