""" The :mod:`sklearn.utils` module includes various utilities. """ import numpy as np from scipy.sparse import issparse import warnings from .murmurhash import murmurhash3_32 from .validation import (as_float_array, assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, check_symmetric) from .class_weight import compute_class_weight, compute_sample_weight from ..externals.joblib import cpu_count from ..exceptions import DataConversionWarning from ..utils.fixes import _Sequence as Sequence from .deprecation import deprecated __all__ = ["murmurhash3_32", "as_float_array", "assert_all_finite", "check_array", "check_random_state", "compute_class_weight", "compute_sample_weight", "column_or_1d", "safe_indexing", "check_consistent_length", "check_X_y", 'indexable', "check_symmetric", "indices_to_mask", "deprecated"] class Bunch(dict): """Container object for datasets Dictionary-like object that exposes its keys as attributes. >>> b = Bunch(a=1, b=2) >>> b['b'] 2 >>> b.b 2 >>> b.a = 3 >>> b['a'] 3 >>> b.c = 6 >>> b['c'] 6 """ def __init__(self, **kwargs): super(Bunch, self).__init__(kwargs) def __setattr__(self, key, value): self[key] = value def __dir__(self): return self.keys() def __getattr__(self, key): try: return self[key] except KeyError: raise AttributeError(key) def __setstate__(self, state): # Bunch pickles generated with scikit-learn 0.16.* have an non # empty __dict__. This causes a surprising behaviour when # loading these pickles scikit-learn 0.17: reading bunch.key # uses __dict__ but assigning to bunch.key use __setattr__ and # only changes bunch['key']. More details can be found at: # https://github.com/scikit-learn/scikit-learn/issues/6196. # Overriding __setstate__ to be a noop has the effect of # ignoring the pickled __dict__ pass def safe_mask(X, mask): """Return a mask which is safe to use on X. Parameters ---------- X : {array-like, sparse matrix} Data on which to apply mask. mask : array Mask to be used on X. Returns ------- mask """ mask = np.asarray(mask) if np.issubdtype(mask.dtype, np.signedinteger): return mask if hasattr(X, "toarray"): ind = np.arange(mask.shape[0]) mask = ind[mask] return mask def axis0_safe_slice(X, mask, len_mask): """ This mask is safer than safe_mask since it returns an empty array, when a sparse matrix is sliced with a boolean mask with all False, instead of raising an unhelpful error in older versions of SciPy. See: https://github.com/scipy/scipy/issues/5361 Also note that we can avoid doing the dot product by checking if the len_mask is not zero in _huber_loss_and_gradient but this is not going to be the bottleneck, since the number of outliers and non_outliers are typically non-zero and it makes the code tougher to follow. """ if len_mask != 0: return X[safe_mask(X, mask), :] return np.zeros(shape=(0, X.shape[1])) def safe_indexing(X, indices): """Return items or rows from X using indices. Allows simple indexing of lists or arrays. Parameters ---------- X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series. Data from which to sample rows or items. indices : array-like of int Indices according to which X will be subsampled. Returns ------- subset Subset of X on first axis Notes ----- CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. """ if hasattr(X, "iloc"): # Work-around for indexing with read-only indices in pandas indices = indices if indices.flags.writeable else indices.copy() # Pandas Dataframes and Series try: return X.iloc[indices] except ValueError: # Cython typed memoryviews internally used in pandas do not support # readonly buffers. warnings.warn("Copying input dataframe for slicing.", DataConversionWarning) return X.copy().iloc[indices] elif hasattr(X, "shape"): if hasattr(X, 'take') and (hasattr(indices, 'dtype') and indices.dtype.kind == 'i'): # This is often substantially faster than X[indices] return X.take(indices, axis=0) else: return X[indices] else: return [X[idx] for idx in indices] def resample(*arrays, **options): """Resample arrays or sparse matrices in a consistent way The default strategy implements one step of the bootstrapping procedure. Parameters ---------- *arrays : sequence of indexable data-structures Indexable data-structures can be arrays, lists, dataframes or scipy sparse matrices with consistent first dimension. replace : boolean, True by default Implements resampling with replacement. If False, this will implement (sliced) random permutations. n_samples : int, None by default Number of samples to generate. If left to None this is automatically set to the first dimension of the arrays. If replace is False it should not be larger than the length of arrays. random_state : int, RandomState instance or None, optional (default=None) The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. Returns ------- resampled_arrays : sequence of indexable data-structures Sequence of resampled views of the collections. The original arrays are not impacted. Examples -------- It is possible to mix sparse and dense arrays in the same run:: >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]]) >>> y = np.array([0, 1, 2]) >>> from scipy.sparse import coo_matrix >>> X_sparse = coo_matrix(X) >>> from sklearn.utils import resample >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0) >>> X array([[ 1., 0.], [ 2., 1.], [ 1., 0.]]) >>> X_sparse # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE <3x2 sparse matrix of type '<... 'numpy.float64'>' with 4 stored elements in Compressed Sparse Row format> >>> X_sparse.toarray() array([[ 1., 0.], [ 2., 1.], [ 1., 0.]]) >>> y array([0, 1, 0]) >>> resample(y, n_samples=2, random_state=0) array([0, 1]) See also -------- :func:`sklearn.utils.shuffle` """ random_state = check_random_state(options.pop('random_state', None)) replace = options.pop('replace', True) max_n_samples = options.pop('n_samples', None) if options: raise ValueError("Unexpected kw arguments: %r" % options.keys()) if len(arrays) == 0: return None first = arrays[0] n_samples = first.shape[0] if hasattr(first, 'shape') else len(first) if max_n_samples is None: max_n_samples = n_samples elif (max_n_samples > n_samples) and (not replace): raise ValueError("Cannot sample %d out of arrays with dim %d " "when replace is False" % (max_n_samples, n_samples)) check_consistent_length(*arrays) if replace: indices = random_state.randint(0, n_samples, size=(max_n_samples,)) else: indices = np.arange(n_samples) random_state.shuffle(indices) indices = indices[:max_n_samples] # convert sparse matrices to CSR for row-based indexing arrays = [a.tocsr() if issparse(a) else a for a in arrays] resampled_arrays = [safe_indexing(a, indices) for a in arrays] if len(resampled_arrays) == 1: # syntactic sugar for the unit argument case return resampled_arrays[0] else: return resampled_arrays def shuffle(*arrays, **options): """Shuffle arrays or sparse matrices in a consistent way This is a convenience alias to ``resample(*arrays, replace=False)`` to do random permutations of the collections. Parameters ---------- *arrays : sequence of indexable data-structures Indexable data-structures can be arrays, lists, dataframes or scipy sparse matrices with consistent first dimension. random_state : int, RandomState instance or None, optional (default=None) The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by `np.random`. n_samples : int, None by default Number of samples to generate. If left to None this is automatically set to the first dimension of the arrays. Returns ------- shuffled_arrays : sequence of indexable data-structures Sequence of shuffled views of the collections. The original arrays are not impacted. Examples -------- It is possible to mix sparse and dense arrays in the same run:: >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]]) >>> y = np.array([0, 1, 2]) >>> from scipy.sparse import coo_matrix >>> X_sparse = coo_matrix(X) >>> from sklearn.utils import shuffle >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0) >>> X array([[ 0., 0.], [ 2., 1.], [ 1., 0.]]) >>> X_sparse # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE <3x2 sparse matrix of type '<... 'numpy.float64'>' with 3 stored elements in Compressed Sparse Row format> >>> X_sparse.toarray() array([[ 0., 0.], [ 2., 1.], [ 1., 0.]]) >>> y array([2, 1, 0]) >>> shuffle(y, n_samples=2, random_state=0) array([0, 1]) See also -------- :func:`sklearn.utils.resample` """ options['replace'] = False return resample(*arrays, **options) def safe_sqr(X, copy=True): """Element wise squaring of array-likes and sparse matrices. Parameters ---------- X : array like, matrix, sparse matrix copy : boolean, optional, default True Whether to create a copy of X and operate on it or to perform inplace computation (default behaviour). Returns ------- X ** 2 : element wise square """ X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], ensure_2d=False) if issparse(X): if copy: X = X.copy() X.data **= 2 else: if copy: X = X ** 2 else: X **= 2 return X def gen_batches(n, batch_size): """Generator to create slices containing batch_size elements, from 0 to n. The last slice may contain less than batch_size elements, when batch_size does not divide n. Examples -------- >>> from sklearn.utils import gen_batches >>> list(gen_batches(7, 3)) [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] >>> list(gen_batches(6, 3)) [slice(0, 3, None), slice(3, 6, None)] >>> list(gen_batches(2, 3)) [slice(0, 2, None)] """ start = 0 for _ in range(int(n // batch_size)): end = start + batch_size yield slice(start, end) start = end if start < n: yield slice(start, n) def gen_even_slices(n, n_packs, n_samples=None): """Generator to create n_packs slices going up to n. Pass n_samples when the slices are to be used for sparse matrix indexing; slicing off-the-end raises an exception, while it works for NumPy arrays. Examples -------- >>> from sklearn.utils import gen_even_slices >>> list(gen_even_slices(10, 1)) [slice(0, 10, None)] >>> list(gen_even_slices(10, 10)) #doctest: +ELLIPSIS [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)] >>> list(gen_even_slices(10, 5)) #doctest: +ELLIPSIS [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)] >>> list(gen_even_slices(10, 3)) [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)] """ start = 0 if n_packs < 1: raise ValueError("gen_even_slices got n_packs=%s, must be >=1" % n_packs) for pack_num in range(n_packs): this_n = n // n_packs if pack_num < n % n_packs: this_n += 1 if this_n > 0: end = start + this_n if n_samples is not None: end = min(n_samples, end) yield slice(start, end, None) start = end def _get_n_jobs(n_jobs): """Get number of jobs for the computation. This function reimplements the logic of joblib to determine the actual number of jobs depending on the cpu count. If -1 all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used. Parameters ---------- n_jobs : int Number of jobs stated in joblib convention. Returns ------- n_jobs : int The actual number of jobs as positive integer. Examples -------- >>> from sklearn.utils import _get_n_jobs >>> _get_n_jobs(4) 4 >>> jobs = _get_n_jobs(-2) >>> assert jobs == max(cpu_count() - 1, 1) >>> _get_n_jobs(0) Traceback (most recent call last): ... ValueError: Parameter n_jobs == 0 has no meaning. """ if n_jobs < 0: return max(cpu_count() + 1 + n_jobs, 1) elif n_jobs == 0: raise ValueError('Parameter n_jobs == 0 has no meaning.') else: return n_jobs def tosequence(x): """Cast iterable x to a Sequence, avoiding a copy if possible. Parameters ---------- x : iterable """ if isinstance(x, np.ndarray): return np.asarray(x) elif isinstance(x, Sequence): return x else: return list(x) def indices_to_mask(indices, mask_length): """Convert list of indices to boolean mask. Parameters ---------- indices : list-like List of integers treated as indices. mask_length : int Length of boolean mask to be generated. Returns ------- mask : 1d boolean nd-array Boolean array that is True where indices are present, else False. """ if mask_length <= np.max(indices): raise ValueError("mask_length must be greater than max(indices)") mask = np.zeros(mask_length, dtype=np.bool) mask[indices] = True return mask