laywerrobot/lib/python3.6/site-packages/sklearn/preprocessing/imputation.py

# Authors: Nicolas Tresegnie <nicolas.tresegnie@gmail.com>
# License: BSD 3 clause

import warnings

import numpy as np
import numpy.ma as ma
from scipy import sparse
from scipy import stats

from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array
from ..utils.sparsefuncs import _get_median
from ..utils.validation import check_is_fitted
from ..utils.validation import FLOAT_DTYPES

from ..externals import six

zip = six.moves.zip
map = six.moves.map

__all__ = [
    'Imputer',
]


def _get_mask(X, value_to_mask):
    """Compute the boolean mask X == missing_values."""
    if value_to_mask == "NaN" or np.isnan(value_to_mask):
        return np.isnan(X)
    else:
        return X == value_to_mask


def _most_frequent(array, extra_value, n_repeat):
    """Compute the most frequent value in a 1d array extended with
       [extra_value] * n_repeat, where extra_value is assumed to be not part
       of the array."""
    # Compute the most frequent value in array only
    if array.size > 0:
        mode = stats.mode(array)
        most_frequent_value = mode[0][0]
        most_frequent_count = mode[1][0]
    else:
        most_frequent_value = 0
        most_frequent_count = 0

    # Compare to array + [extra_value] * n_repeat
    if most_frequent_count == 0 and n_repeat == 0:
        return np.nan
    elif most_frequent_count < n_repeat:
        return extra_value
    elif most_frequent_count > n_repeat:
        return most_frequent_value
    elif most_frequent_count == n_repeat:
        # Ties the breaks. Copy the behaviour of scipy.stats.mode
        if most_frequent_value < extra_value:
            return most_frequent_value
        else:
            return extra_value


class Imputer(BaseEstimator, TransformerMixin):
    """Imputation transformer for completing missing values.

    Read more in the :ref:`User Guide <imputation>`.

    Parameters
    ----------
    missing_values : integer or "NaN", optional (default="NaN")
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed. For missing values encoded as np.nan,
        use the string value "NaN".

    strategy : string, optional (default="mean")
        The imputation strategy.

        - If "mean", then replace missing values using the mean along
          the axis.
        - If "median", then replace missing values using the median along
          the axis.
        - If "most_frequent", then replace missing using the most frequent
          value along the axis.

    axis : integer, optional (default=0)
        The axis along which to impute.

        - If `axis=0`, then impute along columns.
        - If `axis=1`, then impute along rows.

    verbose : integer, optional (default=0)
        Controls the verbosity of the imputer.

    copy : boolean, optional (default=True)
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible. Note that, in the following cases,
        a new copy will always be made, even if `copy=False`:

        - If X is not an array of floating values;
        - If X is sparse and `missing_values=0`;
        - If `axis=0` and X is encoded as a CSR matrix;
        - If `axis=1` and X is encoded as a CSC matrix.

    Attributes
    ----------
    statistics_ : array of shape (n_features,)
        The imputation fill value for each feature if axis == 0.

    Notes
    -----
    - When ``axis=0``, columns which only contained missing values at `fit`
      are discarded upon `transform`.
    - When ``axis=1``, an exception is raised if there are rows for which it is
      not possible to fill in the missing values (e.g., because they only
      contain missing values).
    """
    def __init__(self, missing_values="NaN", strategy="mean",
                 axis=0, verbose=0, copy=True):
        self.missing_values = missing_values
        self.strategy = strategy
        self.axis = axis
        self.verbose = verbose
        self.copy = copy

    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : Imputer
            Returns self.
        """
        # Check parameters
        allowed_strategies = ["mean", "median", "most_frequent"]
        if self.strategy not in allowed_strategies:
            raise ValueError("Can only use these strategies: {0} "
                             " got strategy={1}".format(allowed_strategies,
                                                        self.strategy))

        if self.axis not in [0, 1]:
            raise ValueError("Can only impute missing values on axis 0 and 1, "
                             " got axis={0}".format(self.axis))

        # Since two different arrays can be provided in fit(X) and
        # transform(X), the imputation data will be computed in transform()
        # when the imputation is done per sample (i.e., when axis=1).
        if self.axis == 0:
            X = check_array(X, accept_sparse='csc', dtype=np.float64,
                            force_all_finite=False)

            if sparse.issparse(X):
                self.statistics_ = self._sparse_fit(X,
                                                    self.strategy,
                                                    self.missing_values,
                                                    self.axis)
            else:
                self.statistics_ = self._dense_fit(X,
                                                   self.strategy,
                                                   self.missing_values,
                                                   self.axis)

        return self

    def _sparse_fit(self, X, strategy, missing_values, axis):
        """Fit the transformer on sparse data."""
        # Imputation is done "by column", so if we want to do it
        # by row we only need to convert the matrix to csr format.
        if axis == 1:
            X = X.tocsr()
        else:
            X = X.tocsc()

        # Count the zeros
        if missing_values == 0:
            n_zeros_axis = np.zeros(X.shape[not axis], dtype=int)
        else:
            n_zeros_axis = X.shape[axis] - np.diff(X.indptr)

        # Mean
        if strategy == "mean":
            if missing_values != 0:
                n_non_missing = n_zeros_axis

                # Mask the missing elements
                mask_missing_values = _get_mask(X.data, missing_values)
                mask_valids = np.logical_not(mask_missing_values)

                # Sum only the valid elements
                new_data = X.data.copy()
                new_data[mask_missing_values] = 0
                X = sparse.csc_matrix((new_data, X.indices, X.indptr),
                                      copy=False)
                sums = X.sum(axis=0)

                # Count the elements != 0
                mask_non_zeros = sparse.csc_matrix(
                    (mask_valids.astype(np.float64),
                     X.indices,
                     X.indptr), copy=False)
                s = mask_non_zeros.sum(axis=0)
                n_non_missing = np.add(n_non_missing, s)

            else:
                sums = X.sum(axis=axis)
                n_non_missing = np.diff(X.indptr)

            # Ignore the error, columns with a np.nan statistics_
            # are not an error at this point. These columns will
            # be removed in transform
            with np.errstate(all="ignore"):
                return np.ravel(sums) / np.ravel(n_non_missing)

        # Median + Most frequent
        else:
            # Remove the missing values, for each column
            columns_all = np.hsplit(X.data, X.indptr[1:-1])
            mask_missing_values = _get_mask(X.data, missing_values)
            mask_valids = np.hsplit(np.logical_not(mask_missing_values),
                                    X.indptr[1:-1])

            # astype necessary for bug in numpy.hsplit before v1.9
            columns = [col[mask.astype(bool, copy=False)]
                       for col, mask in zip(columns_all, mask_valids)]

            # Median
            if strategy == "median":
                median = np.empty(len(columns))
                for i, column in enumerate(columns):
                    median[i] = _get_median(column, n_zeros_axis[i])

                return median

            # Most frequent
            elif strategy == "most_frequent":
                most_frequent = np.empty(len(columns))

                for i, column in enumerate(columns):
                    most_frequent[i] = _most_frequent(column,
                                                      0,
                                                      n_zeros_axis[i])

                return most_frequent

    def _dense_fit(self, X, strategy, missing_values, axis):
        """Fit the transformer on dense data."""
        X = check_array(X, force_all_finite=False)
        mask = _get_mask(X, missing_values)
        masked_X = ma.masked_array(X, mask=mask)

        # Mean
        if strategy == "mean":
            mean_masked = np.ma.mean(masked_X, axis=axis)
            # Avoid the warning "Warning: converting a masked element to nan."
            mean = np.ma.getdata(mean_masked)
            mean[np.ma.getmask(mean_masked)] = np.nan

            return mean

        # Median
        elif strategy == "median":
            if tuple(int(v) for v in np.__version__.split('.')[:2]) < (1, 5):
                # In old versions of numpy, calling a median on an array
                # containing nans returns nan. This is different is
                # recent versions of numpy, which we want to mimic
                masked_X.mask = np.logical_or(masked_X.mask,
                                              np.isnan(X))
            median_masked = np.ma.median(masked_X, axis=axis)
            # Avoid the warning "Warning: converting a masked element to nan."
            median = np.ma.getdata(median_masked)
            median[np.ma.getmaskarray(median_masked)] = np.nan

            return median

        # Most frequent
        elif strategy == "most_frequent":
            # scipy.stats.mstats.mode cannot be used because it will no work
            # properly if the first element is masked and if its frequency
            # is equal to the frequency of the most frequent valid element
            # See https://github.com/scipy/scipy/issues/2636

            # To be able access the elements by columns
            if axis == 0:
                X = X.transpose()
                mask = mask.transpose()

            most_frequent = np.empty(X.shape[0])

            for i, (row, row_mask) in enumerate(zip(X[:], mask[:])):
                row_mask = np.logical_not(row_mask).astype(np.bool)
                row = row[row_mask]
                most_frequent[i] = _most_frequent(row, np.nan, 0)

            return most_frequent

    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            The input data to complete.
        """
        if self.axis == 0:
            check_is_fitted(self, 'statistics_')
            X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
                            force_all_finite=False, copy=self.copy)
            statistics = self.statistics_
            if X.shape[1] != statistics.shape[0]:
                raise ValueError("X has %d features per sample, expected %d"
                                 % (X.shape[1], self.statistics_.shape[0]))

        # Since two different arrays can be provided in fit(X) and
        # transform(X), the imputation data need to be recomputed
        # when the imputation is done per sample
        else:
            X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
                            force_all_finite=False, copy=self.copy)

            if sparse.issparse(X):
                statistics = self._sparse_fit(X,
                                              self.strategy,
                                              self.missing_values,
                                              self.axis)

            else:
                statistics = self._dense_fit(X,
                                             self.strategy,
                                             self.missing_values,
                                             self.axis)

        # Delete the invalid rows/columns
        invalid_mask = np.isnan(statistics)
        valid_mask = np.logical_not(invalid_mask)
        valid_statistics = statistics[valid_mask]
        valid_statistics_indexes = np.where(valid_mask)[0]
        missing = np.arange(X.shape[not self.axis])[invalid_mask]

        if self.axis == 0 and invalid_mask.any():
            if self.verbose:
                warnings.warn("Deleting features without "
                              "observed values: %s" % missing)
            X = X[:, valid_statistics_indexes]
        elif self.axis == 1 and invalid_mask.any():
            raise ValueError("Some rows only contain "
                             "missing values: %s" % missing)

        # Do actual imputation
        if sparse.issparse(X) and self.missing_values != 0:
            mask = _get_mask(X.data, self.missing_values)
            indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int),
                                np.diff(X.indptr))[mask]

            X.data[mask] = valid_statistics[indexes].astype(X.dtype,
                                                            copy=False)
        else:
            if sparse.issparse(X):
                X = X.toarray()

            mask = _get_mask(X, self.missing_values)
            n_missing = np.sum(mask, axis=self.axis)
            values = np.repeat(valid_statistics, n_missing)

            if self.axis == 0:
                coordinates = np.where(mask.transpose())[::-1]
            else:
                coordinates = mask

            X[coordinates] = values

        return X
first commit 2020-08-27 21:55:39 +02:00			`# Authors: Nicolas Tresegnie <nicolas.tresegnie@gmail.com>`
			`# License: BSD 3 clause`

			`import warnings`

			`import numpy as np`
			`import numpy.ma as ma`
			`from scipy import sparse`
			`from scipy import stats`

			`from ..base import BaseEstimator, TransformerMixin`
			`from ..utils import check_array`
			`from ..utils.sparsefuncs import _get_median`
			`from ..utils.validation import check_is_fitted`
			`from ..utils.validation import FLOAT_DTYPES`

			`from ..externals import six`

			`zip = six.moves.zip`
			`map = six.moves.map`

			`__all__ = [`
			`'Imputer',`
			`]`


			`def _get_mask(X, value_to_mask):`
			`"""Compute the boolean mask X == missing_values."""`
			`if value_to_mask == "NaN" or np.isnan(value_to_mask):`
			`return np.isnan(X)`
			`else:`
			`return X == value_to_mask`


			`def _most_frequent(array, extra_value, n_repeat):`
			`"""Compute the most frequent value in a 1d array extended with`
			`[extra_value] * n_repeat, where extra_value is assumed to be not part`
			`of the array."""`
			`# Compute the most frequent value in array only`
			`if array.size > 0:`
			`mode = stats.mode(array)`
			`most_frequent_value = mode[0][0]`
			`most_frequent_count = mode[1][0]`
			`else:`
			`most_frequent_value = 0`
			`most_frequent_count = 0`

			`# Compare to array + [extra_value] * n_repeat`
			`if most_frequent_count == 0 and n_repeat == 0:`
			`return np.nan`
			`elif most_frequent_count < n_repeat:`
			`return extra_value`
			`elif most_frequent_count > n_repeat:`
			`return most_frequent_value`
			`elif most_frequent_count == n_repeat:`
			`# Ties the breaks. Copy the behaviour of scipy.stats.mode`
			`if most_frequent_value < extra_value:`
			`return most_frequent_value`
			`else:`
			`return extra_value`


			`class Imputer(BaseEstimator, TransformerMixin):`
			`"""Imputation transformer for completing missing values.`

			Read more in the :ref:`User Guide <imputation>`.

			`Parameters`
			`----------`
			`missing_values : integer or "NaN", optional (default="NaN")`
			`The placeholder for the missing values. All occurrences of`
			`missing_values` will be imputed. For missing values encoded as np.nan,
			`use the string value "NaN".`

			`strategy : string, optional (default="mean")`
			`The imputation strategy.`

			`- If "mean", then replace missing values using the mean along`
			`the axis.`
			`- If "median", then replace missing values using the median along`
			`the axis.`
			`- If "most_frequent", then replace missing using the most frequent`
			`value along the axis.`

			`axis : integer, optional (default=0)`
			`The axis along which to impute.`

			- If `axis=0`, then impute along columns.
			- If `axis=1`, then impute along rows.

			`verbose : integer, optional (default=0)`
			`Controls the verbosity of the imputer.`

			`copy : boolean, optional (default=True)`
			`If True, a copy of X will be created. If False, imputation will`
			`be done in-place whenever possible. Note that, in the following cases,`
			a new copy will always be made, even if `copy=False`:

			`- If X is not an array of floating values;`
			- If X is sparse and `missing_values=0`;
			- If `axis=0` and X is encoded as a CSR matrix;
			- If `axis=1` and X is encoded as a CSC matrix.

			`Attributes`
			`----------`
			`statistics_ : array of shape (n_features,)`
			`The imputation fill value for each feature if axis == 0.`

			`Notes`
			`-----`
			- When ``axis=0``, columns which only contained missing values at `fit`
			are discarded upon `transform`.
			- When ``axis=1``, an exception is raised if there are rows for which it is
			`not possible to fill in the missing values (e.g., because they only`
			`contain missing values).`
			`"""`
			`def __init__(self, missing_values="NaN", strategy="mean",`
			`axis=0, verbose=0, copy=True):`
			`self.missing_values = missing_values`
			`self.strategy = strategy`
			`self.axis = axis`
			`self.verbose = verbose`
			`self.copy = copy`

			`def fit(self, X, y=None):`
			`"""Fit the imputer on X.`

			`Parameters`
			`----------`
			`X : {array-like, sparse matrix}, shape (n_samples, n_features)`
			Input data, where ``n_samples`` is the number of samples and
			``n_features`` is the number of features.

			`Returns`
			`-------`
			`self : Imputer`
			`Returns self.`
			`"""`
			`# Check parameters`
			`allowed_strategies = ["mean", "median", "most_frequent"]`
			`if self.strategy not in allowed_strategies:`
			`raise ValueError("Can only use these strategies: {0} "`
			`" got strategy={1}".format(allowed_strategies,`
			`self.strategy))`

			`if self.axis not in [0, 1]:`
			`raise ValueError("Can only impute missing values on axis 0 and 1, "`
			`" got axis={0}".format(self.axis))`

			`# Since two different arrays can be provided in fit(X) and`
			`# transform(X), the imputation data will be computed in transform()`
			`# when the imputation is done per sample (i.e., when axis=1).`
			`if self.axis == 0:`
			`X = check_array(X, accept_sparse='csc', dtype=np.float64,`
			`force_all_finite=False)`

			`if sparse.issparse(X):`
			`self.statistics_ = self._sparse_fit(X,`
			`self.strategy,`
			`self.missing_values,`
			`self.axis)`
			`else:`
			`self.statistics_ = self._dense_fit(X,`
			`self.strategy,`
			`self.missing_values,`
			`self.axis)`

			`return self`

			`def _sparse_fit(self, X, strategy, missing_values, axis):`
			`"""Fit the transformer on sparse data."""`
			`# Imputation is done "by column", so if we want to do it`
			`# by row we only need to convert the matrix to csr format.`
			`if axis == 1:`
			`X = X.tocsr()`
			`else:`
			`X = X.tocsc()`

			`# Count the zeros`
			`if missing_values == 0:`
			`n_zeros_axis = np.zeros(X.shape[not axis], dtype=int)`
			`else:`
			`n_zeros_axis = X.shape[axis] - np.diff(X.indptr)`

			`# Mean`
			`if strategy == "mean":`
			`if missing_values != 0:`
			`n_non_missing = n_zeros_axis`

			`# Mask the missing elements`
			`mask_missing_values = _get_mask(X.data, missing_values)`
			`mask_valids = np.logical_not(mask_missing_values)`

			`# Sum only the valid elements`
			`new_data = X.data.copy()`
			`new_data[mask_missing_values] = 0`
			`X = sparse.csc_matrix((new_data, X.indices, X.indptr),`
			`copy=False)`
			`sums = X.sum(axis=0)`

			`# Count the elements != 0`
			`mask_non_zeros = sparse.csc_matrix(`
			`(mask_valids.astype(np.float64),`
			`X.indices,`
			`X.indptr), copy=False)`
			`s = mask_non_zeros.sum(axis=0)`
			`n_non_missing = np.add(n_non_missing, s)`

			`else:`
			`sums = X.sum(axis=axis)`
			`n_non_missing = np.diff(X.indptr)`

			`# Ignore the error, columns with a np.nan statistics_`
			`# are not an error at this point. These columns will`
			`# be removed in transform`
			`with np.errstate(all="ignore"):`
			`return np.ravel(sums) / np.ravel(n_non_missing)`

			`# Median + Most frequent`
			`else:`
			`# Remove the missing values, for each column`
			`columns_all = np.hsplit(X.data, X.indptr[1:-1])`
			`mask_missing_values = _get_mask(X.data, missing_values)`
			`mask_valids = np.hsplit(np.logical_not(mask_missing_values),`
			`X.indptr[1:-1])`

			`# astype necessary for bug in numpy.hsplit before v1.9`
			`columns = [col[mask.astype(bool, copy=False)]`
			`for col, mask in zip(columns_all, mask_valids)]`

			`# Median`
			`if strategy == "median":`
			`median = np.empty(len(columns))`
			`for i, column in enumerate(columns):`
			`median[i] = _get_median(column, n_zeros_axis[i])`

			`return median`

			`# Most frequent`
			`elif strategy == "most_frequent":`
			`most_frequent = np.empty(len(columns))`

			`for i, column in enumerate(columns):`
			`most_frequent[i] = _most_frequent(column,`
			`0,`
			`n_zeros_axis[i])`

			`return most_frequent`

			`def _dense_fit(self, X, strategy, missing_values, axis):`
			`"""Fit the transformer on dense data."""`
			`X = check_array(X, force_all_finite=False)`
			`mask = _get_mask(X, missing_values)`
			`masked_X = ma.masked_array(X, mask=mask)`

			`# Mean`
			`if strategy == "mean":`
			`mean_masked = np.ma.mean(masked_X, axis=axis)`
			`# Avoid the warning "Warning: converting a masked element to nan."`
			`mean = np.ma.getdata(mean_masked)`
			`mean[np.ma.getmask(mean_masked)] = np.nan`

			`return mean`

			`# Median`
			`elif strategy == "median":`
			`if tuple(int(v) for v in np.__version__.split('.')[:2]) < (1, 5):`
			`# In old versions of numpy, calling a median on an array`
			`# containing nans returns nan. This is different is`
			`# recent versions of numpy, which we want to mimic`
			`masked_X.mask = np.logical_or(masked_X.mask,`
			`np.isnan(X))`
			`median_masked = np.ma.median(masked_X, axis=axis)`
			`# Avoid the warning "Warning: converting a masked element to nan."`
			`median = np.ma.getdata(median_masked)`
			`median[np.ma.getmaskarray(median_masked)] = np.nan`

			`return median`

			`# Most frequent`
			`elif strategy == "most_frequent":`
			`# scipy.stats.mstats.mode cannot be used because it will no work`
			`# properly if the first element is masked and if its frequency`
			`# is equal to the frequency of the most frequent valid element`
			`# See https://github.com/scipy/scipy/issues/2636`

			`# To be able access the elements by columns`
			`if axis == 0:`
			`X = X.transpose()`
			`mask = mask.transpose()`

			`most_frequent = np.empty(X.shape[0])`

			`for i, (row, row_mask) in enumerate(zip(X[:], mask[:])):`
			`row_mask = np.logical_not(row_mask).astype(np.bool)`
			`row = row[row_mask]`
			`most_frequent[i] = _most_frequent(row, np.nan, 0)`

			`return most_frequent`

			`def transform(self, X):`
			`"""Impute all missing values in X.`

			`Parameters`
			`----------`
			`X : {array-like, sparse matrix}, shape = [n_samples, n_features]`
			`The input data to complete.`
			`"""`
			`if self.axis == 0:`
			`check_is_fitted(self, 'statistics_')`
			`X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,`
			`force_all_finite=False, copy=self.copy)`
			`statistics = self.statistics_`
			`if X.shape[1] != statistics.shape[0]:`
			`raise ValueError("X has %d features per sample, expected %d"`
			`% (X.shape[1], self.statistics_.shape[0]))`

			`# Since two different arrays can be provided in fit(X) and`
			`# transform(X), the imputation data need to be recomputed`
			`# when the imputation is done per sample`
			`else:`
			`X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,`
			`force_all_finite=False, copy=self.copy)`

			`if sparse.issparse(X):`
			`statistics = self._sparse_fit(X,`
			`self.strategy,`
			`self.missing_values,`
			`self.axis)`

			`else:`
			`statistics = self._dense_fit(X,`
			`self.strategy,`
			`self.missing_values,`
			`self.axis)`

			`# Delete the invalid rows/columns`
			`invalid_mask = np.isnan(statistics)`
			`valid_mask = np.logical_not(invalid_mask)`
			`valid_statistics = statistics[valid_mask]`
			`valid_statistics_indexes = np.where(valid_mask)[0]`
			`missing = np.arange(X.shape[not self.axis])[invalid_mask]`

			`if self.axis == 0 and invalid_mask.any():`
			`if self.verbose:`
			`warnings.warn("Deleting features without "`
			`"observed values: %s" % missing)`
			`X = X[:, valid_statistics_indexes]`
			`elif self.axis == 1 and invalid_mask.any():`
			`raise ValueError("Some rows only contain "`
			`"missing values: %s" % missing)`

			`# Do actual imputation`
			`if sparse.issparse(X) and self.missing_values != 0:`
			`mask = _get_mask(X.data, self.missing_values)`
			`indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int),`
			`np.diff(X.indptr))[mask]`

			`X.data[mask] = valid_statistics[indexes].astype(X.dtype,`
			`copy=False)`
			`else:`
			`if sparse.issparse(X):`
			`X = X.toarray()`

			`mask = _get_mask(X, self.missing_values)`
			`n_missing = np.sum(mask, axis=self.axis)`
			`values = np.repeat(valid_statistics, n_missing)`

			`if self.axis == 0:`
			`coordinates = np.where(mask.transpose())[::-1]`
			`else:`
			`coordinates = mask`

			`X[coordinates] = values`

			`return X`