509 lines
15 KiB
Python
509 lines
15 KiB
Python
|
"""
|
||
|
The :mod:`sklearn.utils` module includes various utilities.
|
||
|
"""
|
||
|
|
||
|
import numpy as np
|
||
|
from scipy.sparse import issparse
|
||
|
import warnings
|
||
|
|
||
|
from .murmurhash import murmurhash3_32
|
||
|
from .validation import (as_float_array,
|
||
|
assert_all_finite,
|
||
|
check_random_state, column_or_1d, check_array,
|
||
|
check_consistent_length, check_X_y, indexable,
|
||
|
check_symmetric)
|
||
|
from .class_weight import compute_class_weight, compute_sample_weight
|
||
|
from ..externals.joblib import cpu_count
|
||
|
from ..exceptions import DataConversionWarning
|
||
|
from ..utils.fixes import _Sequence as Sequence
|
||
|
from .deprecation import deprecated
|
||
|
|
||
|
|
||
|
# Names exported by a star-import of this module.
# NOTE(review): several public helpers defined in this file (e.g. resample,
# shuffle, gen_batches) are not listed here -- confirm whether that is
# intentional before adding them.
__all__ = ["murmurhash3_32", "as_float_array",
           "assert_all_finite", "check_array",
           "check_random_state",
           "compute_class_weight", "compute_sample_weight",
           "column_or_1d", "safe_indexing",
           "check_consistent_length", "check_X_y", 'indexable',
           "check_symmetric", "indices_to_mask", "deprecated"]
|
||
|
|
||
|
|
||
|
class Bunch(dict):
    """Dictionary whose entries are also reachable as attributes.

    Reading ``bunch.key`` looks up ``bunch['key']`` and assigning to
    ``bunch.key`` stores into ``bunch['key']``.

    Examples
    --------
    >>> b = Bunch(a=1, b=2)
    >>> b['b']
    2
    >>> b.b
    2
    >>> b.a = 3
    >>> b['a']
    3
    >>> b.c = 6
    >>> b['c']
    6
    """

    def __init__(self, **kwargs):
        # All keyword arguments become the initial dictionary items.
        super(Bunch, self).__init__(kwargs)

    def __setattr__(self, key, value):
        # Attribute assignment is redirected to item assignment.
        self[key] = value

    def __dir__(self):
        # Advertise the stored keys as the available attribute names.
        return self.keys()

    def __getattr__(self, key):
        # Only reached when normal attribute lookup fails; a missing key has
        # to surface as AttributeError so getattr()/hasattr() keep working.
        try:
            value = self[key]
        except KeyError:
            raise AttributeError(key)
        return value

    def __setstate__(self, state):
        # Deliberately a no-op. Pickles written by scikit-learn 0.16.* carry
        # a non-empty __dict__; restoring it would make ``bunch.key`` read
        # from __dict__ while ``bunch.key = ...`` writes to ``bunch['key']``,
        # so both views drift apart. Ignoring the pickled state avoids that.
        # See https://github.com/scikit-learn/scikit-learn/issues/6196.
        pass
|
||
|
|
||
|
|
||
|
def safe_mask(X, mask):
    """Return a mask which is safe to use on X.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : array
        Mask to be used on X.

    Returns
    -------
    mask : ndarray
        ``mask`` converted to an array. If it does not have a signed integer
        dtype and ``X`` exposes a ``toarray`` attribute, the mask is turned
        into the array of selected positions instead.
    """
    mask = np.asarray(mask)
    already_indices = np.issubdtype(mask.dtype, np.signedinteger)

    # Objects with ``toarray`` (treated here as sparse matrices) get
    # integer positions rather than the non-integer mask.
    if not already_indices and hasattr(X, "toarray"):
        positions = np.arange(mask.shape[0])
        return positions[mask]
    return mask
|
||
|
|
||
|
|
||
|
def axis0_safe_slice(X, mask, len_mask):
    """Select rows of X with a mask, tolerating an empty selection.

    This is safer than indexing with ``safe_mask`` alone: when ``len_mask``
    is zero an empty array is returned directly, instead of slicing a sparse
    matrix with an all-False boolean mask, which raises an unhelpful error
    in older versions of SciPy.

    See: https://github.com/scipy/scipy/issues/5361

    Also note that we can avoid doing the dot product by checking if
    the len_mask is not zero in _huber_loss_and_gradient but this
    is not going to be the bottleneck, since the number of outliers
    and non_outliers are typically non-zero and it makes the code
    tougher to follow.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data to slice along its first axis.

    mask : array
        Mask to be used on X.

    len_mask : int
        Size of the selection as supplied by the caller (presumably the
        number of rows ``mask`` selects); zero short-circuits the slicing.

    Returns
    -------
    rows
        ``X[safe_mask(X, mask), :]`` or, when ``len_mask`` is zero, a dense
        array of zeros with shape ``(0, X.shape[1])``.
    """
    if len_mask == 0:
        return np.zeros(shape=(0, X.shape[1]))
    row_selector = safe_mask(X, mask)
    return X[row_selector, :]
|
||
|
|
||
|
|
||
|
def safe_indexing(X, indices):
    """Return items or rows from X using indices.

    Allows simple indexing of lists or arrays.

    Parameters
    ----------
    X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series.
        Data from which to sample rows or items.
    indices : array-like of int
        Indices according to which X will be subsampled.

    Returns
    -------
    subset
        Subset of X on first axis

    Notes
    -----
    CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
    not supported.
    """
    if hasattr(X, "iloc"):
        # Pandas Dataframes and Series.
        # Work-around for indexing with read-only indices in pandas: copy
        # them first. Plain lists/tuples have no ``flags`` attribute and are
        # passed through unchanged instead of raising AttributeError.
        flags = getattr(indices, "flags", None)
        if flags is not None and not flags.writeable:
            indices = indices.copy()
        try:
            return X.iloc[indices]
        except ValueError:
            # Cython typed memoryviews internally used in pandas do not support
            # readonly buffers.
            warnings.warn("Copying input dataframe for slicing.",
                          DataConversionWarning)
            return X.copy().iloc[indices]
    elif hasattr(X, "shape"):
        if hasattr(X, 'take') and (hasattr(indices, 'dtype') and
                                   indices.dtype.kind == 'i'):
            # This is often substantially faster than X[indices]
            return X.take(indices, axis=0)
        else:
            return X[indices]
    else:
        # Generic containers (e.g. lists) are indexed item by item.
        return [X[idx] for idx in indices]
|
||
|
|
||
|
|
||
|
def resample(*arrays, **options):
    """Resample arrays or sparse matrices in a consistent way

    The default strategy implements one step of the bootstrapping
    procedure.

    Parameters
    ----------
    *arrays : sequence of indexable data-structures
        Indexable data-structures can be arrays, lists, dataframes or scipy
        sparse matrices with consistent first dimension.

    replace : boolean, True by default
        Implements resampling with replacement. If False, this will implement
        (sliced) random permutations.

    n_samples : int, None by default
        Number of samples to generate. If left to None this is
        automatically set to the first dimension of the arrays.
        If replace is False it should not be larger than the length of
        arrays.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data.  If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by `np.random`.

    Returns
    -------
    resampled_arrays : sequence of indexable data-structures
        Sequence of resampled views of the collections. The original arrays are
        not impacted. When a single array is passed, the resampled array
        itself is returned rather than a one-element sequence; when no array
        is passed, None is returned.

    Raises
    ------
    ValueError
        If an unknown keyword argument is given, or if ``n_samples`` exceeds
        the length of the arrays while ``replace`` is False.

    Examples
    --------
    It is possible to mix sparse and dense arrays in the same run::

      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
      >>> y = np.array([0, 1, 2])

      >>> from scipy.sparse import coo_matrix
      >>> X_sparse = coo_matrix(X)

      >>> from sklearn.utils import resample
      >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
      >>> X
      array([[ 1., 0.],
             [ 2., 1.],
             [ 1., 0.]])

      >>> X_sparse                   # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
      <3x2 sparse matrix of type '<... 'numpy.float64'>'
          with 4 stored elements in Compressed Sparse Row format>

      >>> X_sparse.toarray()
      array([[ 1., 0.],
             [ 2., 1.],
             [ 1., 0.]])

      >>> y
      array([0, 1, 0])

      >>> resample(y, n_samples=2, random_state=0)
      array([0, 1])


    See also
    --------
    :func:`sklearn.utils.shuffle`
    """
    # Consume the recognised keyword options; anything left over is an error.
    rng = check_random_state(options.pop('random_state', None))
    with_replacement = options.pop('replace', True)
    n_requested = options.pop('n_samples', None)
    if options:
        raise ValueError("Unexpected kw arguments: %r" % options.keys())

    if not arrays:
        return None

    # The first array defines the population size.
    reference = arrays[0]
    if hasattr(reference, 'shape'):
        n_available = reference.shape[0]
    else:
        n_available = len(reference)

    if n_requested is None:
        n_requested = n_available
    elif not with_replacement and n_requested > n_available:
        raise ValueError("Cannot sample %d out of arrays with dim %d "
                         "when replace is False" % (n_requested,
                                                    n_available))

    check_consistent_length(*arrays)

    if with_replacement:
        picked = rng.randint(0, n_available, size=(n_requested,))
    else:
        # Random permutation, truncated to the requested number of samples.
        picked = np.arange(n_available)
        rng.shuffle(picked)
        picked = picked[:n_requested]

    # convert sparse matrices to CSR for row-based indexing
    row_indexable = [a.tocsr() if issparse(a) else a for a in arrays]
    resampled = [safe_indexing(a, picked) for a in row_indexable]

    # syntactic sugar for the unit argument case
    return resampled[0] if len(resampled) == 1 else resampled
|
||
|
|
||
|
|
||
|
def shuffle(*arrays, **options):
    """Shuffle arrays or sparse matrices in a consistent way

    This is a convenience alias to ``resample(*arrays, replace=False)`` to do
    random permutations of the collections.

    Parameters
    ----------
    *arrays : sequence of indexable data-structures
        Indexable data-structures can be arrays, lists, dataframes or scipy
        sparse matrices with consistent first dimension.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data.  If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by `np.random`.

    n_samples : int, None by default
        Number of samples to generate. If left to None this is
        automatically set to the first dimension of the arrays.

    Returns
    -------
    shuffled_arrays : sequence of indexable data-structures
        Sequence of shuffled views of the collections. The original arrays are
        not impacted.

    Examples
    --------
    It is possible to mix sparse and dense arrays in the same run::

      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
      >>> y = np.array([0, 1, 2])

      >>> from scipy.sparse import coo_matrix
      >>> X_sparse = coo_matrix(X)

      >>> from sklearn.utils import shuffle
      >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
      >>> X
      array([[ 0., 0.],
             [ 2., 1.],
             [ 1., 0.]])

      >>> X_sparse                   # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
      <3x2 sparse matrix of type '<... 'numpy.float64'>'
          with 3 stored elements in Compressed Sparse Row format>

      >>> X_sparse.toarray()
      array([[ 0., 0.],
             [ 2., 1.],
             [ 1., 0.]])

      >>> y
      array([2, 1, 0])

      >>> shuffle(y, n_samples=2, random_state=0)
      array([0, 1])

    See also
    --------
    :func:`sklearn.utils.resample`
    """
    # Any caller-supplied ``replace`` is overridden: shuffling never samples
    # with replacement.
    forwarded = dict(options, replace=False)
    return resample(*arrays, **forwarded)
|
||
|
|
||
|
|
||
|
def safe_sqr(X, copy=True):
    """Element wise squaring of array-likes and sparse matrices.

    Parameters
    ----------
    X : array like, matrix, sparse matrix

    copy : boolean, optional, default True
        Whether to square a copy (the default) or to square in place the
        object returned by ``check_array``.

    Returns
    -------
    X ** 2 : element wise square
    """
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], ensure_2d=False)

    if issparse(X):
        # Only the explicitly stored entries need squaring.
        target = X.copy() if copy else X
        target.data **= 2
        return target

    if copy:
        return X ** 2
    X **= 2
    return X
|
||
|
|
||
|
|
||
|
def gen_batches(n, batch_size):
    """Generator to create slices containing batch_size elements, from 0 to n.

    The last slice may contain less than batch_size elements, when batch_size
    does not divide n.

    Parameters
    ----------
    n : int
        Upper bound of the range covered by the slices.

    batch_size : int
        Number of elements in each full batch.

    Yields
    ------
    slice
        Consecutive slices covering ``0`` to ``n``.

    Examples
    --------
    >>> from sklearn.utils import gen_batches
    >>> list(gen_batches(7, 3))
    [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
    >>> list(gen_batches(6, 3))
    [slice(0, 3, None), slice(3, 6, None)]
    >>> list(gen_batches(2, 3))
    [slice(0, 2, None)]
    """
    lower = 0
    full_batches_left = int(n // batch_size)
    while full_batches_left > 0:
        upper = lower + batch_size
        yield slice(lower, upper)
        lower = upper
        full_batches_left -= 1
    # Whatever remains forms a final, shorter batch.
    if lower < n:
        yield slice(lower, n)
|
||
|
|
||
|
|
||
|
def gen_even_slices(n, n_packs, n_samples=None):
    """Generator to create n_packs slices going up to n.

    Pass n_samples when the slices are to be used for sparse matrix indexing;
    slicing off-the-end raises an exception, while it works for NumPy arrays.

    Parameters
    ----------
    n : int
        Upper bound of the range covered by the slices.

    n_packs : int
        Number of slices to generate; must be at least 1.

    n_samples : int or None, optional (default=None)
        If given, no slice end exceeds this value.

    Yields
    ------
    slice
        Consecutive slices; the first ``n % n_packs`` are one element longer
        than the others, and empty packs are skipped.

    Raises
    ------
    ValueError
        If ``n_packs`` is smaller than 1.

    Examples
    --------
    >>> from sklearn.utils import gen_even_slices
    >>> list(gen_even_slices(10, 1))
    [slice(0, 10, None)]
    >>> list(gen_even_slices(10, 10))                     #doctest: +ELLIPSIS
    [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)]
    >>> list(gen_even_slices(10, 5))                      #doctest: +ELLIPSIS
    [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)]
    >>> list(gen_even_slices(10, 3))
    [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]
    """
    if n_packs < 1:
        raise ValueError("gen_even_slices got n_packs=%s, must be >=1"
                         % n_packs)

    # Every pack holds ``base_size`` items; the leading ``n_bigger`` packs
    # take one extra so that the remainder is spread out.
    base_size, n_bigger = divmod(n, n_packs)
    begin = 0
    for pack_idx in range(n_packs):
        size = base_size + 1 if pack_idx < n_bigger else base_size
        if size <= 0:
            continue
        stop = begin + size
        if n_samples is not None:
            stop = min(n_samples, stop)
        yield slice(begin, stop, None)
        begin = stop
|
||
|
|
||
|
|
||
|
def _get_n_jobs(n_jobs):
    """Get number of jobs for the computation.

    This function reimplements the logic of joblib to determine the actual
    number of jobs depending on the cpu count. If -1 all CPUs are used.
    If 1 is given, no parallel computing code is used at all, which is useful
    for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
    Thus for n_jobs = -2, all CPUs but one are used.

    Parameters
    ----------
    n_jobs : int
        Number of jobs stated in joblib convention.

    Returns
    -------
    n_jobs : int
        The actual number of jobs as positive integer.

    Raises
    ------
    ValueError
        If ``n_jobs`` is 0.

    Examples
    --------
    >>> from sklearn.utils import _get_n_jobs
    >>> _get_n_jobs(4)
    4
    >>> jobs = _get_n_jobs(-2)
    >>> assert jobs == max(cpu_count() - 1, 1)
    >>> _get_n_jobs(0)
    Traceback (most recent call last):
    ...
    ValueError: Parameter n_jobs == 0 has no meaning.
    """
    if n_jobs > 0:
        return n_jobs
    if n_jobs == 0:
        raise ValueError('Parameter n_jobs == 0 has no meaning.')
    # Negative values count back from the number of CPUs, never below 1.
    return max(cpu_count() + 1 + n_jobs, 1)
|
||
|
|
||
|
|
||
|
def tosequence(x):
    """Cast iterable x to a Sequence, avoiding a copy if possible.

    Parameters
    ----------
    x : iterable

    Returns
    -------
    sequence
        ``np.asarray(x)`` for NumPy arrays, ``x`` itself when it already is
        a Sequence, and ``list(x)`` for any other iterable.
    """
    if isinstance(x, np.ndarray):
        return np.asarray(x)
    if isinstance(x, Sequence):
        return x
    # Generic iterables have to be materialised.
    return list(x)
|
||
|
|
||
|
|
||
|
def indices_to_mask(indices, mask_length):
    """Convert list of indices to boolean mask.

    Parameters
    ----------
    indices : list-like
        List of integers treated as indices. May be empty, in which case the
        returned mask is all False.
    mask_length : int
        Length of boolean mask to be generated.

    Returns
    -------
    mask : 1d boolean nd-array
        Boolean array that is True where indices are present, else False.

    Raises
    ------
    ValueError
        If ``mask_length`` is not greater than the largest index.
    """
    # ``np.max`` fails on empty input, so an empty selection is handled
    # separately and simply yields an all-False mask.
    is_empty = np.size(indices) == 0
    if not is_empty and mask_length <= np.max(indices):
        raise ValueError("mask_length must be greater than max(indices)")

    # The builtin ``bool`` is used because the ``np.bool`` alias was removed
    # from NumPy.
    mask = np.zeros(mask_length, dtype=bool)
    if not is_empty:
        mask[indices] = True

    return mask
|