alpcentaur
/
basabuuka_prototyp


								"""Transformers for missing value imputation"""

								# Authors: Nicolas Tresegnie <nicolas.tresegnie@gmail.com>

								#          Sergey Feldman <sergeyfeldman@gmail.com>

								# License: BSD 3 clause


								import warnings

								import numbers


								import numpy as np

								import numpy.ma as ma

								from scipy import sparse

								from scipy import stats


								from .base import BaseEstimator, TransformerMixin

								from .utils import check_array

								from .utils.sparsefuncs import _get_median

								from .utils.validation import check_is_fitted

								from .utils.validation import FLOAT_DTYPES

								from .utils.fixes import _object_dtype_isnan

								from .utils import is_scalar_nan


								from .externals import six


								# Rebind the module-level ``zip`` and ``map`` names to the ``six.moves``
								# versions (presumably the iterator-returning variants on both Python 2
								# and Python 3 -- confirm against the vendored ``six``).
								zip = six.moves.zip

								map = six.moves.map


								# Names exported by ``from <this module> import *``.
								__all__ = [

								    'MissingIndicator',

								    'SimpleImputer',

								]


								def _check_inputs_dtype(X, missing_values):
								    """Check that numeric data is paired with a numeric placeholder.

								    Parameters
								    ----------
								    X : object exposing a ``dtype`` attribute
								        The data whose ``dtype.kind`` is inspected.

								    missing_values : object
								        The placeholder used to mark missing entries.

								    Raises
								    ------
								    ValueError
								        If ``X`` has a float, signed or unsigned integer dtype while
								        ``missing_values`` is not an instance of ``numbers.Real``.
								    """
								    # Data that is not float/int/uint puts no constraint on the placeholder.
								    if X.dtype.kind not in ("f", "i", "u"):
								        return
								    if isinstance(missing_values, numbers.Real):
								        return

								    message = ("'X' and 'missing_values' types are expected to be"
								               " both numerical. Got X.dtype={} and "
								               " type(missing_values)={}.")
								    raise ValueError(message.format(X.dtype, type(missing_values)))


								def _get_mask(X, value_to_mask):
								    """Compute the boolean mask of the entries of X equal to value_to_mask.

								    When ``value_to_mask`` is NaN the mask is built with a NaN test chosen
								    from ``X.dtype.kind``; otherwise it is an element-wise equality test.
								    """
								    if not is_scalar_nan(value_to_mask):
								        # ``X == value_to_mask`` with object dtypes does not always perform
								        # element-wise for old versions of numpy, hence np.equal.
								        return np.equal(X, value_to_mask)

								    kind = X.dtype.kind
								    if kind == "f":
								        return np.isnan(X)
								    if kind in ("i", "u"):
								        # An integer array cannot contain NaN: nothing is masked.
								        return np.zeros(X.shape, dtype=bool)
								    # np.isnan does not work on object dtypes.
								    return _object_dtype_isnan(X)


								def _most_frequent(array, extra_value, n_repeat):
								    """Compute the most frequent value in a 1d array extended with
								    [extra_value] * n_repeat, where extra_value is assumed to be not part
								    of the array.

								    Parameters
								    ----------
								    array : ndarray, 1d
								        The observed values. May be empty.

								    extra_value : object
								        A value that is not present in ``array``.

								    n_repeat : int
								        How many times ``extra_value`` is considered to occur.

								    Returns
								    -------
								    The most frequent value of the extended array. ``np.nan`` is returned
								    when both ``array`` is empty and ``n_repeat`` is 0. Ties are resolved
								    in favour of the smallest value.
								    """
								    # Local import: only this function needs it.
								    from collections import Counter

								    # Compute the most frequent value in array only.
								    # The counting is done here rather than with scipy.stats.mode, whose
								    # return shape for 1d input and whose support for object arrays
								    # differ between SciPy releases.
								    if array.size > 0:
								        if array.dtype == object:
								            occurrences = Counter(array)
								            most_frequent_count = occurrences.most_common(1)[0][1]
								            # Smallest value among the most frequent ones.
								            most_frequent_value = min(
								                value for value, count in occurrences.items()
								                if count == most_frequent_count)
								        else:
								            # np.unique returns sorted values and np.argmax the first
								            # maximum, so ties go to the smallest value.
								            values, counts = np.unique(array, return_counts=True)
								            best = np.argmax(counts)
								            most_frequent_value = values[best]
								            most_frequent_count = counts[best]
								    else:
								        most_frequent_value = 0
								        most_frequent_count = 0

								    # Compare to array + [extra_value] * n_repeat
								    if most_frequent_count == 0 and n_repeat == 0:
								        return np.nan
								    if most_frequent_count < n_repeat:
								        return extra_value
								    if most_frequent_count > n_repeat:
								        return most_frequent_value

								    # most_frequent_count == n_repeat: break the tie by taking the
								    # smallest of the two candidates.
								    if most_frequent_value < extra_value:
								        return most_frequent_value
								    return extra_value


								class SimpleImputer(BaseEstimator, TransformerMixin):
								    """Imputation transformer for completing missing values.

								    Read more in the :ref:`User Guide <impute>`.

								    Parameters
								    ----------
								    missing_values : number, string, np.nan (default) or None
								        The placeholder for the missing values. All occurrences of
								        `missing_values` will be imputed.

								    strategy : string, optional (default="mean")
								        The imputation strategy.

								        - If "mean", then replace missing values using the mean along
								          each column. Can only be used with numeric data.
								        - If "median", then replace missing values using the median along
								          each column. Can only be used with numeric data.
								        - If "most_frequent", then replace missing values using the most
								          frequent value along each column. Can be used with strings or
								          numeric data.
								        - If "constant", then replace missing values with fill_value. Can be
								          used with strings or numeric data.

								        .. versionadded:: 0.20
								           strategy="constant" for fixed value imputation.

								    fill_value : string or numerical value, optional (default=None)
								        When strategy == "constant", fill_value is used to replace all
								        occurrences of missing_values.
								        If left to the default, fill_value will be 0 when imputing numerical
								        data and "missing_value" for strings or object data types.

								    verbose : integer, optional (default=0)
								        Controls the verbosity of the imputer.

								    copy : boolean, optional (default=True)
								        If True, a copy of X will be created. If False, imputation will
								        be done in-place whenever possible. Note that, in the following cases,
								        a new copy will always be made, even if `copy=False`:

								        - If X is not an array of floating values;
								        - If X is encoded as a CSR matrix.

								    Attributes
								    ----------
								    statistics_ : array of shape (n_features,)
								        The imputation fill value for each feature.

								    Examples
								    --------
								    >>> import numpy as np
								    >>> from sklearn.impute import SimpleImputer
								    >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
								    >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
								    ... # doctest: +NORMALIZE_WHITESPACE
								    SimpleImputer(copy=True, fill_value=None, missing_values=nan,
								           strategy='mean', verbose=0)
								    >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
								    >>> print(imp_mean.transform(X))
								    ... # doctest: +NORMALIZE_WHITESPACE
								    [[ 7.   2.   3. ]
								     [ 4.   3.5  6. ]
								     [10.   3.5  9. ]]

								    Notes
								    -----
								    Columns which only contained missing values at `fit` are discarded upon
								    `transform` if strategy is not "constant".

								    """
								    def __init__(self, missing_values=np.nan, strategy="mean",
								                 fill_value=None, verbose=0, copy=True):
								        self.missing_values = missing_values
								        self.strategy = strategy
								        self.fill_value = fill_value
								        self.verbose = verbose
								        self.copy = copy

								    def _validate_input(self, X):
								        """Check ``self.strategy`` and pass ``X`` through ``check_array``.

								        Parameters
								        ----------
								        X : {array-like, sparse matrix}
								            The data handed to ``fit`` or ``transform``.

								        Returns
								        -------
								        X : the object returned by ``check_array``.

								        Raises
								        ------
								        ValueError
								            If the strategy is unknown, if ``check_array`` rejects the
								            input, if the data dtype and ``missing_values`` are
								            inconsistent, or if the resulting dtype kind is not one of
								            "i", "u", "f" or "O".
								        """
								        allowed_strategies = ["mean", "median", "most_frequent", "constant"]
								        if self.strategy not in allowed_strategies:
								            raise ValueError("Can only use these strategies: {0} "
								                             " got strategy={1}".format(allowed_strategies,
								                                                        self.strategy))

								        # "mean" and "median" request a float dtype; the other strategies
								        # leave the dtype untouched.
								        if self.strategy in ("most_frequent", "constant"):
								            dtype = None
								        else:
								            dtype = FLOAT_DTYPES

								        # NaN must be let through when it is the missing value marker.
								        if not is_scalar_nan(self.missing_values):
								            force_all_finite = True
								        else:
								            force_all_finite = "allow-nan"

								        try:
								            X = check_array(X, accept_sparse='csc', dtype=dtype,
								                            force_all_finite=force_all_finite, copy=self.copy)
								        except ValueError as ve:
								            if "could not convert" not in str(ve):
								                # Unrelated failure: re-raise it unchanged.
								                raise
								            # check_array failed, so X is still the caller's object and
								            # may have no ``dtype`` attribute (e.g. a list of lists).
								            received = getattr(getattr(X, "dtype", None), "kind", None)
								            if received is None:
								                received = type(X).__name__
								            raise ValueError("Cannot use {0} strategy with non-numeric "
								                             "data. Received datatype :{1}."
								                             "".format(self.strategy, received))

								        _check_inputs_dtype(X, self.missing_values)
								        if X.dtype.kind not in ("i", "u", "f", "O"):
								            raise ValueError("SimpleImputer does not support data with dtype "
								                             "{0}. Please provide either a numeric array (with"
								                             " a floating point or integer dtype) or "
								                             "categorical data represented either as an array "
								                             "with integer dtype or an array of string values "
								                             "with an object dtype.".format(X.dtype))

								        return X

								    def fit(self, X, y=None):
								        """Fit the imputer on X.

								        Parameters
								        ----------
								        X : {array-like, sparse matrix}, shape (n_samples, n_features)
								            Input data, where ``n_samples`` is the number of samples and
								            ``n_features`` is the number of features.

								        y : ignored

								        Returns
								        -------
								        self : SimpleImputer

								        Raises
								        ------
								        ValueError
								            If ``fill_value`` is not numerical while imputing numerical
								            data with the "constant" strategy, or if X is sparse and
								            ``missing_values`` is 0.
								        """
								        X = self._validate_input(X)

								        # default fill_value is 0 for numerical input and "missing_value"
								        # otherwise
								        if self.fill_value is None:
								            if X.dtype.kind in ("i", "u", "f"):
								                fill_value = 0
								            else:
								                fill_value = "missing_value"
								        else:
								            fill_value = self.fill_value

								        # fill_value should be numerical in case of numerical input
								        if (self.strategy == "constant" and
								                X.dtype.kind in ("i", "u", "f") and
								                not isinstance(fill_value, numbers.Real)):
								            raise ValueError("'fill_value'={0} is invalid. Expected a "
								                             "numerical value when imputing numerical "
								                             "data".format(fill_value))

								        if sparse.issparse(X):
								            # missing_values = 0 not allowed with sparse data as it would
								            # force densification
								            if self.missing_values == 0:
								                raise ValueError("Imputation not possible when missing_values "
								                                 "== 0 and input is sparse. Provide a dense "
								                                 "array instead.")
								            else:
								                self.statistics_ = self._sparse_fit(X,
								                                                    self.strategy,
								                                                    self.missing_values,
								                                                    fill_value)
								        else:
								            self.statistics_ = self._dense_fit(X,
								                                               self.strategy,
								                                               self.missing_values,
								                                               fill_value)

								        return self

								    def _sparse_fit(self, X, strategy, missing_values, fill_value):
								        """Fit the transformer on sparse data.

								        ``X`` is read column by column through ``X.data`` / ``X.indptr``
								        (``_validate_input`` requests the CSC format). Returns one
								        statistic per column.
								        """
								        mask_data = _get_mask(X.data, missing_values)
								        # Entries of each column that are not stored explicitly.
								        n_implicit_zeros = X.shape[0] - np.diff(X.indptr)

								        statistics = np.empty(X.shape[1])

								        if strategy == "constant":
								            # for constant strategy, self.statistics_ is used to store
								            # fill_value in each column
								            statistics.fill(fill_value)

								        else:
								            for i in range(X.shape[1]):
								                column = X.data[X.indptr[i]:X.indptr[i + 1]]
								                mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]]
								                # Drop the stored entries flagged as missing.
								                column = column[~mask_column]

								                # combine explicit and implicit zeros
								                mask_zeros = _get_mask(column, 0)
								                column = column[~mask_zeros]
								                n_explicit_zeros = mask_zeros.sum()
								                n_zeros = n_implicit_zeros[i] + n_explicit_zeros

								                if strategy == "mean":
								                    # Zeros count in the denominator only.
								                    s = column.size + n_zeros
								                    statistics[i] = np.nan if s == 0 else column.sum() / s

								                elif strategy == "median":
								                    statistics[i] = _get_median(column,
								                                                n_zeros)

								                elif strategy == "most_frequent":
								                    statistics[i] = _most_frequent(column,
								                                                   0,
								                                                   n_zeros)
								        return statistics

								    def _dense_fit(self, X, strategy, missing_values, fill_value):
								        """Fit the transformer on dense data.

								        Returns one statistic per column of ``X``; for "mean" and
								        "median" a fully masked column yields ``np.nan``.
								        """
								        mask = _get_mask(X, missing_values)
								        masked_X = ma.masked_array(X, mask=mask)

								        # Mean
								        if strategy == "mean":
								            mean_masked = np.ma.mean(masked_X, axis=0)
								            # Avoid the warning "Warning: converting a masked element to nan."
								            mean = np.ma.getdata(mean_masked)
								            mean[np.ma.getmaskarray(mean_masked)] = np.nan

								            return mean

								        # Median
								        elif strategy == "median":
								            median_masked = np.ma.median(masked_X, axis=0)
								            # Avoid the warning "Warning: converting a masked element to nan."
								            median = np.ma.getdata(median_masked)
								            median[np.ma.getmaskarray(median_masked)] = np.nan

								            return median

								        # Most frequent
								        elif strategy == "most_frequent":
								            # scipy.stats.mstats.mode cannot be used because it will not work
								            # properly if the first element is masked and if its frequency
								            # is equal to the frequency of the most frequent valid element
								            # See https://github.com/scipy/scipy/issues/2636

								            # To be able to access the elements by columns
								            X = X.transpose()
								            mask = mask.transpose()

								            if X.dtype.kind == "O":
								                most_frequent = np.empty(X.shape[0], dtype=object)
								            else:
								                most_frequent = np.empty(X.shape[0])

								            for i, (row, row_mask) in enumerate(zip(X[:], mask[:])):
								                # Keep only the observed (non-missing) entries.
								                # The builtin ``bool`` is used: the ``np.bool`` alias is
								                # not available in current NumPy.
								                row_mask = np.logical_not(row_mask).astype(bool)
								                row = row[row_mask]
								                most_frequent[i] = _most_frequent(row, np.nan, 0)

								            return most_frequent

								        # Constant
								        elif strategy == "constant":
								            # for constant strategy, self.statistics_ is used to store
								            # fill_value in each column
								            return np.full(X.shape[1], fill_value, dtype=X.dtype)

								    def transform(self, X):
								        """Impute all missing values in X.

								        Parameters
								        ----------
								        X : {array-like, sparse matrix}, shape (n_samples, n_features)
								            The input data to complete.

								        Returns
								        -------
								        X : the validated input with missing entries replaced. Unless the
								            strategy is "constant", columns whose statistic is NaN are
								            removed.

								        Raises
								        ------
								        ValueError
								            If the number of columns differs from the number of fitted
								            statistics, or if X is sparse and ``missing_values`` is 0.
								        """
								        check_is_fitted(self, 'statistics_')

								        X = self._validate_input(X)

								        statistics = self.statistics_

								        if X.shape[1] != statistics.shape[0]:
								            raise ValueError("X has %d features per sample, expected %d"
								                             % (X.shape[1], self.statistics_.shape[0]))

								        # Delete the invalid columns if strategy is not constant
								        if self.strategy == "constant":
								            valid_statistics = statistics
								        else:
								            # same as np.isnan but also works for object dtypes
								            invalid_mask = _get_mask(statistics, np.nan)
								            valid_mask = np.logical_not(invalid_mask)
								            valid_statistics = statistics[valid_mask]
								            valid_statistics_indexes = np.flatnonzero(valid_mask)

								            if invalid_mask.any():
								                if self.verbose:
								                    missing = np.arange(X.shape[1])[invalid_mask]
								                    warnings.warn("Deleting features without "
								                                  "observed values: %s" % missing)
								                X = X[:, valid_statistics_indexes]

								        # Do actual imputation
								        if sparse.issparse(X):
								            if self.missing_values == 0:
								                raise ValueError("Imputation not possible when missing_values "
								                                 "== 0 and input is sparse. Provide a dense "
								                                 "array instead.")
								            else:
								                mask = _get_mask(X.data, self.missing_values)
								                # Column index of every stored entry, restricted to the
								                # missing ones. The builtin ``int`` is used: the
								                # ``np.int`` alias is not available in current NumPy.
								                indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=int),
								                                    np.diff(X.indptr))[mask]

								                X.data[mask] = valid_statistics[indexes].astype(X.dtype,
								                                                                copy=False)
								        else:
								            mask = _get_mask(X, self.missing_values)
								            n_missing = np.sum(mask, axis=0)
								            # One copy of each column statistic per missing entry of
								            # that column, in column-major order to match ``coordinates``.
								            values = np.repeat(valid_statistics, n_missing)
								            coordinates = np.where(mask.transpose())[::-1]

								            X[coordinates] = values

								        return X


								class MissingIndicator(BaseEstimator, TransformerMixin):
								    """Binary indicators for missing values.

								    Parameters
								    ----------
								    missing_values : number, string, np.nan (default) or None
								        The placeholder for the missing values. All occurrences of
								        `missing_values` will be flagged in the indicator mask.

								    features : str, optional
								        Whether the imputer mask should represent all or a subset of
								        features.

								        - If "missing-only" (default), the imputer mask will only represent
								          features containing missing values during fit time.
								        - If "all", the imputer mask will represent all features.

								    sparse : boolean or "auto", optional
								        Whether the imputer mask format should be sparse or dense.

								        - If "auto" (default), the imputer mask will be of same type as
								          input.
								        - If True, the imputer mask will be a sparse matrix.
								        - If False, the imputer mask will be a numpy array.

								    error_on_new : boolean, optional
								        If True (default), transform will raise an error when there are
								        features with missing values in transform that have no missing values
								        in fit. This is applicable only when ``features="missing-only"``.

								    Attributes
								    ----------
								    features_ : ndarray, shape (n_missing_features,) or (n_features,)
								        The features indices which will be returned when calling ``transform``.
								        They are computed during ``fit``. For ``features='all'``, it is
								        equal to ``range(n_features)``.

								    Examples
								    --------
								    >>> import numpy as np
								    >>> from sklearn.impute import MissingIndicator
								    >>> X1 = np.array([[np.nan, 1, 3],
								    ...                [4, 0, np.nan],
								    ...                [8, 1, 0]])
								    >>> X2 = np.array([[5, 1, np.nan],
								    ...                [np.nan, 2, 3],
								    ...                [2, 4, 0]])
								    >>> indicator = MissingIndicator()
								    >>> indicator.fit(X1)
								    MissingIndicator(error_on_new=True, features='missing-only',
								             missing_values=nan, sparse='auto')
								    >>> X2_tr = indicator.transform(X2)
								    >>> X2_tr
								    array([[False,  True],
								           [ True, False],
								           [False, False]])

								    """

								    def __init__(self, missing_values=np.nan, features="missing-only",
								                 sparse="auto", error_on_new=True):
								        self.missing_values = missing_values
								        self.features = features
								        self.sparse = sparse
								        self.error_on_new = error_on_new

								    def _get_missing_features_info(self, X):
								        """Compute the imputer mask and the indices of the features
								        containing missing values.

								        Parameters
								        ----------
								        X : {ndarray or sparse matrix}, shape (n_samples, n_features)
								            The input data with missing values. Note that ``X`` has been
								            checked in ``fit`` and ``transform`` before calling this
								            function.

								        Returns
								        -------
								        imputer_mask : {ndarray or sparse matrix}, shape \
								(n_samples, n_features) or (n_samples, n_features_with_missing)
								            The imputer mask of the original data.

								        features_with_missing : ndarray, shape (n_features_with_missing)
								            The features containing missing values.

								        """
								        # NOTE: the bare name ``sparse`` below is the scipy.sparse module;
								        # the constructor parameter is ``self.sparse``.
								        if sparse.issparse(X) and self.missing_values != 0:
								            mask = _get_mask(X.data, self.missing_values)

								            # The imputer mask will be constructed with the same sparse format
								            # as X.
								            sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
								                                  else sparse.csc_matrix)
								            imputer_mask = sparse_constructor(
								                (mask, X.indices.copy(), X.indptr.copy()),
								                shape=X.shape, dtype=bool)

								            # Work on a copy stripped of its False entries so that only
								            # the missing positions remain stored.
								            missing_values_mask = imputer_mask.copy()
								            missing_values_mask.eliminate_zeros()
								            features_with_missing = (
								                np.flatnonzero(np.diff(missing_values_mask.indptr))
								                if missing_values_mask.format == 'csc'
								                else np.unique(missing_values_mask.indices))

								            if self.sparse is False:
								                imputer_mask = imputer_mask.toarray()
								            elif imputer_mask.format == 'csr':
								                imputer_mask = imputer_mask.tocsc()
								        else:
								            if sparse.issparse(X):
								                # case of sparse matrix with 0 as missing values. Implicit and
								                # explicit zeros are considered as missing values.
								                X = X.toarray()
								            imputer_mask = _get_mask(X, self.missing_values)
								            features_with_missing = np.flatnonzero(imputer_mask.sum(axis=0))

								            if self.sparse is True:
								                imputer_mask = sparse.csc_matrix(imputer_mask)

								        return imputer_mask, features_with_missing

								    def fit(self, X, y=None):
								        """Fit the transformer on X.

								        Parameters
								        ----------
								        X : {array-like, sparse matrix}, shape (n_samples, n_features)
								            Input data, where ``n_samples`` is the number of samples and
								            ``n_features`` is the number of features.

								        y : ignored

								        Returns
								        -------
								        self : object
								            Returns self.

								        Raises
								        ------
								        ValueError
								            If ``features`` is neither 'missing-only' nor 'all', or if
								            ``sparse`` is neither a boolean nor 'auto'.
								        """
								        # NaN must be let through when it is the missing value marker.
								        if not is_scalar_nan(self.missing_values):
								            force_all_finite = True
								        else:
								            force_all_finite = "allow-nan"
								        X = check_array(X, accept_sparse=('csc', 'csr'),
								                        force_all_finite=force_all_finite)
								        _check_inputs_dtype(X, self.missing_values)

								        self._n_features = X.shape[1]

								        if self.features not in ('missing-only', 'all'):
								            raise ValueError("'features' has to be either 'missing-only' or "
								                             "'all'. Got {} instead.".format(self.features))

								        if not ((isinstance(self.sparse, six.string_types) and
								                self.sparse == "auto") or isinstance(self.sparse, bool)):
								            raise ValueError("'sparse' has to be a boolean or 'auto'. "
								                             "Got {!r} instead.".format(self.sparse))

								        self.features_ = (self._get_missing_features_info(X)[1]
								                          if self.features == 'missing-only'
								                          else np.arange(self._n_features))

								        return self

								    def transform(self, X):
								        """Generate missing values indicator for X.

								        Parameters
								        ----------
								        X : {array-like, sparse matrix}, shape (n_samples, n_features)
								            The input data to complete.

								        Returns
								        -------
								        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \
								or (n_samples, len(features_))
								            The missing indicator for input data. The data type of ``Xt``
								            will be boolean. With ``features="missing-only"`` only the
								            columns listed in ``features_`` are returned.

								        Raises
								        ------
								        ValueError
								            If X does not have the number of features seen in ``fit``, or
								            if ``error_on_new`` is True and a feature has missing values
								            here but had none in ``fit``.
								        """
								        check_is_fitted(self, "features_")

								        if not is_scalar_nan(self.missing_values):
								            force_all_finite = True
								        else:
								            force_all_finite = "allow-nan"
								        X = check_array(X, accept_sparse=('csc', 'csr'),
								                        force_all_finite=force_all_finite)
								        _check_inputs_dtype(X, self.missing_values)

								        if X.shape[1] != self._n_features:
								            raise ValueError("X has a different number of features "
								                             "than during fitting.")

								        imputer_mask, features = self._get_missing_features_info(X)

								        if self.features == "missing-only":
								            features_diff_fit_trans = np.setdiff1d(features, self.features_)
								            if (self.error_on_new and features_diff_fit_trans.size > 0):
								                raise ValueError("The features {} have missing values "
								                                 "in transform but have no missing values "
								                                 "in fit.".format(features_diff_fit_trans))

								            # Restrict the mask to the features selected in fit. This
								            # also applies when ``features_`` is empty, so that no
								            # column is returned instead of all of them.
								            if self.features_.size < self._n_features:
								                imputer_mask = imputer_mask[:, self.features_]

								        return imputer_mask

								    def fit_transform(self, X, y=None):
								        """Generate missing values indicator for X.

								        Equivalent to calling ``fit(X, y)`` followed by ``transform(X)``.

								        Parameters
								        ----------
								        X : {array-like, sparse matrix}, shape (n_samples, n_features)
								            The input data to complete.

								        y : ignored

								        Returns
								        -------
								        Xt : {ndarray or sparse matrix}
								            The missing indicator for input data, as returned by
								            ``transform``. The data type of ``Xt`` will be boolean.

								        """
								        return self.fit(X, y).transform(X)