You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

631 lines
24 KiB

4 years ago
  1. """Transformers for missing value imputation"""
  2. # Authors: Nicolas Tresegnie <nicolas.tresegnie@gmail.com>
  3. # Sergey Feldman <sergeyfeldman@gmail.com>
  4. # License: BSD 3 clause
  5. import warnings
  6. import numbers
  7. import numpy as np
  8. import numpy.ma as ma
  9. from scipy import sparse
  10. from scipy import stats
  11. from .base import BaseEstimator, TransformerMixin
  12. from .utils import check_array
  13. from .utils.sparsefuncs import _get_median
  14. from .utils.validation import check_is_fitted
  15. from .utils.validation import FLOAT_DTYPES
  16. from .utils.fixes import _object_dtype_isnan
  17. from .utils import is_scalar_nan
  18. from .externals import six
  19. zip = six.moves.zip
  20. map = six.moves.map
  21. __all__ = [
  22. 'MissingIndicator',
  23. 'SimpleImputer',
  24. ]
  25. def _check_inputs_dtype(X, missing_values):
  26. if (X.dtype.kind in ("f", "i", "u") and
  27. not isinstance(missing_values, numbers.Real)):
  28. raise ValueError("'X' and 'missing_values' types are expected to be"
  29. " both numerical. Got X.dtype={} and "
  30. " type(missing_values)={}."
  31. .format(X.dtype, type(missing_values)))
  32. def _get_mask(X, value_to_mask):
  33. """Compute the boolean mask X == missing_values."""
  34. if is_scalar_nan(value_to_mask):
  35. if X.dtype.kind == "f":
  36. return np.isnan(X)
  37. elif X.dtype.kind in ("i", "u"):
  38. # can't have NaNs in integer array.
  39. return np.zeros(X.shape, dtype=bool)
  40. else:
  41. # np.isnan does not work on object dtypes.
  42. return _object_dtype_isnan(X)
  43. else:
  44. # X == value_to_mask with object dytpes does not always perform
  45. # element-wise for old versions of numpy
  46. return np.equal(X, value_to_mask)
  47. def _most_frequent(array, extra_value, n_repeat):
  48. """Compute the most frequent value in a 1d array extended with
  49. [extra_value] * n_repeat, where extra_value is assumed to be not part
  50. of the array."""
  51. # Compute the most frequent value in array only
  52. if array.size > 0:
  53. with warnings.catch_warnings():
  54. # stats.mode raises a warning when input array contains objects due
  55. # to incapacity to detect NaNs. Irrelevant here since input array
  56. # has already been NaN-masked.
  57. warnings.simplefilter("ignore", RuntimeWarning)
  58. mode = stats.mode(array)
  59. most_frequent_value = mode[0][0]
  60. most_frequent_count = mode[1][0]
  61. else:
  62. most_frequent_value = 0
  63. most_frequent_count = 0
  64. # Compare to array + [extra_value] * n_repeat
  65. if most_frequent_count == 0 and n_repeat == 0:
  66. return np.nan
  67. elif most_frequent_count < n_repeat:
  68. return extra_value
  69. elif most_frequent_count > n_repeat:
  70. return most_frequent_value
  71. elif most_frequent_count == n_repeat:
  72. # Ties the breaks. Copy the behaviour of scipy.stats.mode
  73. if most_frequent_value < extra_value:
  74. return most_frequent_value
  75. else:
  76. return extra_value
  77. class SimpleImputer(BaseEstimator, TransformerMixin):
  78. """Imputation transformer for completing missing values.
  79. Read more in the :ref:`User Guide <impute>`.
  80. Parameters
  81. ----------
  82. missing_values : number, string, np.nan (default) or None
  83. The placeholder for the missing values. All occurrences of
  84. `missing_values` will be imputed.
  85. strategy : string, optional (default="mean")
  86. The imputation strategy.
  87. - If "mean", then replace missing values using the mean along
  88. each column. Can only be used with numeric data.
  89. - If "median", then replace missing values using the median along
  90. each column. Can only be used with numeric data.
  91. - If "most_frequent", then replace missing using the most frequent
  92. value along each column. Can be used with strings or numeric data.
  93. - If "constant", then replace missing values with fill_value. Can be
  94. used with strings or numeric data.
  95. .. versionadded:: 0.20
  96. strategy="constant" for fixed value imputation.
  97. fill_value : string or numerical value, optional (default=None)
  98. When strategy == "constant", fill_value is used to replace all
  99. occurrences of missing_values.
  100. If left to the default, fill_value will be 0 when imputing numerical
  101. data and "missing_value" for strings or object data types.
  102. verbose : integer, optional (default=0)
  103. Controls the verbosity of the imputer.
  104. copy : boolean, optional (default=True)
  105. If True, a copy of X will be created. If False, imputation will
  106. be done in-place whenever possible. Note that, in the following cases,
  107. a new copy will always be made, even if `copy=False`:
  108. - If X is not an array of floating values;
  109. - If X is encoded as a CSR matrix.
  110. Attributes
  111. ----------
  112. statistics_ : array of shape (n_features,)
  113. The imputation fill value for each feature.
  114. Examples
  115. --------
  116. >>> import numpy as np
  117. >>> from sklearn.impute import SimpleImputer
  118. >>> imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
  119. >>> imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])
  120. ... # doctest: +NORMALIZE_WHITESPACE
  121. SimpleImputer(copy=True, fill_value=None, missing_values=nan,
  122. strategy='mean', verbose=0)
  123. >>> X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
  124. >>> print(imp_mean.transform(X))
  125. ... # doctest: +NORMALIZE_WHITESPACE
  126. [[ 7. 2. 3. ]
  127. [ 4. 3.5 6. ]
  128. [10. 3.5 9. ]]
  129. Notes
  130. -----
  131. Columns which only contained missing values at `fit` are discarded upon
  132. `transform` if strategy is not "constant".
  133. """
  134. def __init__(self, missing_values=np.nan, strategy="mean",
  135. fill_value=None, verbose=0, copy=True):
  136. self.missing_values = missing_values
  137. self.strategy = strategy
  138. self.fill_value = fill_value
  139. self.verbose = verbose
  140. self.copy = copy
  141. def _validate_input(self, X):
  142. allowed_strategies = ["mean", "median", "most_frequent", "constant"]
  143. if self.strategy not in allowed_strategies:
  144. raise ValueError("Can only use these strategies: {0} "
  145. " got strategy={1}".format(allowed_strategies,
  146. self.strategy))
  147. if self.strategy in ("most_frequent", "constant"):
  148. dtype = None
  149. else:
  150. dtype = FLOAT_DTYPES
  151. if not is_scalar_nan(self.missing_values):
  152. force_all_finite = True
  153. else:
  154. force_all_finite = "allow-nan"
  155. try:
  156. X = check_array(X, accept_sparse='csc', dtype=dtype,
  157. force_all_finite=force_all_finite, copy=self.copy)
  158. except ValueError as ve:
  159. if "could not convert" in str(ve):
  160. raise ValueError("Cannot use {0} strategy with non-numeric "
  161. "data. Received datatype :{1}."
  162. "".format(self.strategy, X.dtype.kind))
  163. else:
  164. raise ve
  165. _check_inputs_dtype(X, self.missing_values)
  166. if X.dtype.kind not in ("i", "u", "f", "O"):
  167. raise ValueError("SimpleImputer does not support data with dtype "
  168. "{0}. Please provide either a numeric array (with"
  169. " a floating point or integer dtype) or "
  170. "categorical data represented either as an array "
  171. "with integer dtype or an array of string values "
  172. "with an object dtype.".format(X.dtype))
  173. return X
  174. def fit(self, X, y=None):
  175. """Fit the imputer on X.
  176. Parameters
  177. ----------
  178. X : {array-like, sparse matrix}, shape (n_samples, n_features)
  179. Input data, where ``n_samples`` is the number of samples and
  180. ``n_features`` is the number of features.
  181. Returns
  182. -------
  183. self : SimpleImputer
  184. """
  185. X = self._validate_input(X)
  186. # default fill_value is 0 for numerical input and "missing_value"
  187. # otherwise
  188. if self.fill_value is None:
  189. if X.dtype.kind in ("i", "u", "f"):
  190. fill_value = 0
  191. else:
  192. fill_value = "missing_value"
  193. else:
  194. fill_value = self.fill_value
  195. # fill_value should be numerical in case of numerical input
  196. if (self.strategy == "constant" and
  197. X.dtype.kind in ("i", "u", "f") and
  198. not isinstance(fill_value, numbers.Real)):
  199. raise ValueError("'fill_value'={0} is invalid. Expected a "
  200. "numerical value when imputing numerical "
  201. "data".format(fill_value))
  202. if sparse.issparse(X):
  203. # missing_values = 0 not allowed with sparse data as it would
  204. # force densification
  205. if self.missing_values == 0:
  206. raise ValueError("Imputation not possible when missing_values "
  207. "== 0 and input is sparse. Provide a dense "
  208. "array instead.")
  209. else:
  210. self.statistics_ = self._sparse_fit(X,
  211. self.strategy,
  212. self.missing_values,
  213. fill_value)
  214. else:
  215. self.statistics_ = self._dense_fit(X,
  216. self.strategy,
  217. self.missing_values,
  218. fill_value)
  219. return self
  220. def _sparse_fit(self, X, strategy, missing_values, fill_value):
  221. """Fit the transformer on sparse data."""
  222. mask_data = _get_mask(X.data, missing_values)
  223. n_implicit_zeros = X.shape[0] - np.diff(X.indptr)
  224. statistics = np.empty(X.shape[1])
  225. if strategy == "constant":
  226. # for constant strategy, self.statistcs_ is used to store
  227. # fill_value in each column
  228. statistics.fill(fill_value)
  229. else:
  230. for i in range(X.shape[1]):
  231. column = X.data[X.indptr[i]:X.indptr[i + 1]]
  232. mask_column = mask_data[X.indptr[i]:X.indptr[i + 1]]
  233. column = column[~mask_column]
  234. # combine explicit and implicit zeros
  235. mask_zeros = _get_mask(column, 0)
  236. column = column[~mask_zeros]
  237. n_explicit_zeros = mask_zeros.sum()
  238. n_zeros = n_implicit_zeros[i] + n_explicit_zeros
  239. if strategy == "mean":
  240. s = column.size + n_zeros
  241. statistics[i] = np.nan if s == 0 else column.sum() / s
  242. elif strategy == "median":
  243. statistics[i] = _get_median(column,
  244. n_zeros)
  245. elif strategy == "most_frequent":
  246. statistics[i] = _most_frequent(column,
  247. 0,
  248. n_zeros)
  249. return statistics
  250. def _dense_fit(self, X, strategy, missing_values, fill_value):
  251. """Fit the transformer on dense data."""
  252. mask = _get_mask(X, missing_values)
  253. masked_X = ma.masked_array(X, mask=mask)
  254. # Mean
  255. if strategy == "mean":
  256. mean_masked = np.ma.mean(masked_X, axis=0)
  257. # Avoid the warning "Warning: converting a masked element to nan."
  258. mean = np.ma.getdata(mean_masked)
  259. mean[np.ma.getmask(mean_masked)] = np.nan
  260. return mean
  261. # Median
  262. elif strategy == "median":
  263. median_masked = np.ma.median(masked_X, axis=0)
  264. # Avoid the warning "Warning: converting a masked element to nan."
  265. median = np.ma.getdata(median_masked)
  266. median[np.ma.getmaskarray(median_masked)] = np.nan
  267. return median
  268. # Most frequent
  269. elif strategy == "most_frequent":
  270. # scipy.stats.mstats.mode cannot be used because it will no work
  271. # properly if the first element is masked and if its frequency
  272. # is equal to the frequency of the most frequent valid element
  273. # See https://github.com/scipy/scipy/issues/2636
  274. # To be able access the elements by columns
  275. X = X.transpose()
  276. mask = mask.transpose()
  277. if X.dtype.kind == "O":
  278. most_frequent = np.empty(X.shape[0], dtype=object)
  279. else:
  280. most_frequent = np.empty(X.shape[0])
  281. for i, (row, row_mask) in enumerate(zip(X[:], mask[:])):
  282. row_mask = np.logical_not(row_mask).astype(np.bool)
  283. row = row[row_mask]
  284. most_frequent[i] = _most_frequent(row, np.nan, 0)
  285. return most_frequent
  286. # Constant
  287. elif strategy == "constant":
  288. # for constant strategy, self.statistcs_ is used to store
  289. # fill_value in each column
  290. return np.full(X.shape[1], fill_value, dtype=X.dtype)
  291. def transform(self, X):
  292. """Impute all missing values in X.
  293. Parameters
  294. ----------
  295. X : {array-like, sparse matrix}, shape (n_samples, n_features)
  296. The input data to complete.
  297. """
  298. check_is_fitted(self, 'statistics_')
  299. X = self._validate_input(X)
  300. statistics = self.statistics_
  301. if X.shape[1] != statistics.shape[0]:
  302. raise ValueError("X has %d features per sample, expected %d"
  303. % (X.shape[1], self.statistics_.shape[0]))
  304. # Delete the invalid columns if strategy is not constant
  305. if self.strategy == "constant":
  306. valid_statistics = statistics
  307. else:
  308. # same as np.isnan but also works for object dtypes
  309. invalid_mask = _get_mask(statistics, np.nan)
  310. valid_mask = np.logical_not(invalid_mask)
  311. valid_statistics = statistics[valid_mask]
  312. valid_statistics_indexes = np.flatnonzero(valid_mask)
  313. if invalid_mask.any():
  314. missing = np.arange(X.shape[1])[invalid_mask]
  315. if self.verbose:
  316. warnings.warn("Deleting features without "
  317. "observed values: %s" % missing)
  318. X = X[:, valid_statistics_indexes]
  319. # Do actual imputation
  320. if sparse.issparse(X):
  321. if self.missing_values == 0:
  322. raise ValueError("Imputation not possible when missing_values "
  323. "== 0 and input is sparse. Provide a dense "
  324. "array instead.")
  325. else:
  326. mask = _get_mask(X.data, self.missing_values)
  327. indexes = np.repeat(np.arange(len(X.indptr) - 1, dtype=np.int),
  328. np.diff(X.indptr))[mask]
  329. X.data[mask] = valid_statistics[indexes].astype(X.dtype,
  330. copy=False)
  331. else:
  332. mask = _get_mask(X, self.missing_values)
  333. n_missing = np.sum(mask, axis=0)
  334. values = np.repeat(valid_statistics, n_missing)
  335. coordinates = np.where(mask.transpose())[::-1]
  336. X[coordinates] = values
  337. return X
  338. class MissingIndicator(BaseEstimator, TransformerMixin):
  339. """Binary indicators for missing values.
  340. Parameters
  341. ----------
  342. missing_values : number, string, np.nan (default) or None
  343. The placeholder for the missing values. All occurrences of
  344. `missing_values` will be imputed.
  345. features : str, optional
  346. Whether the imputer mask should represent all or a subset of
  347. features.
  348. - If "missing-only" (default), the imputer mask will only represent
  349. features containing missing values during fit time.
  350. - If "all", the imputer mask will represent all features.
  351. sparse : boolean or "auto", optional
  352. Whether the imputer mask format should be sparse or dense.
  353. - If "auto" (default), the imputer mask will be of same type as
  354. input.
  355. - If True, the imputer mask will be a sparse matrix.
  356. - If False, the imputer mask will be a numpy array.
  357. error_on_new : boolean, optional
  358. If True (default), transform will raise an error when there are
  359. features with missing values in transform that have no missing values
  360. in fit This is applicable only when ``features="missing-only"``.
  361. Attributes
  362. ----------
  363. features_ : ndarray, shape (n_missing_features,) or (n_features,)
  364. The features indices which will be returned when calling ``transform``.
  365. They are computed during ``fit``. For ``features='all'``, it is
  366. to ``range(n_features)``.
  367. Examples
  368. --------
  369. >>> import numpy as np
  370. >>> from sklearn.impute import MissingIndicator
  371. >>> X1 = np.array([[np.nan, 1, 3],
  372. ... [4, 0, np.nan],
  373. ... [8, 1, 0]])
  374. >>> X2 = np.array([[5, 1, np.nan],
  375. ... [np.nan, 2, 3],
  376. ... [2, 4, 0]])
  377. >>> indicator = MissingIndicator()
  378. >>> indicator.fit(X1)
  379. MissingIndicator(error_on_new=True, features='missing-only',
  380. missing_values=nan, sparse='auto')
  381. >>> X2_tr = indicator.transform(X2)
  382. >>> X2_tr
  383. array([[False, True],
  384. [ True, False],
  385. [False, False]])
  386. """
  387. def __init__(self, missing_values=np.nan, features="missing-only",
  388. sparse="auto", error_on_new=True):
  389. self.missing_values = missing_values
  390. self.features = features
  391. self.sparse = sparse
  392. self.error_on_new = error_on_new
  393. def _get_missing_features_info(self, X):
  394. """Compute the imputer mask and the indices of the features
  395. containing missing values.
  396. Parameters
  397. ----------
  398. X : {ndarray or sparse matrix}, shape (n_samples, n_features)
  399. The input data with missing values. Note that ``X`` has been
  400. checked in ``fit`` and ``transform`` before to call this function.
  401. Returns
  402. -------
  403. imputer_mask : {ndarray or sparse matrix}, shape \
  404. (n_samples, n_features) or (n_samples, n_features_with_missing)
  405. The imputer mask of the original data.
  406. features_with_missing : ndarray, shape (n_features_with_missing)
  407. The features containing missing values.
  408. """
  409. if sparse.issparse(X) and self.missing_values != 0:
  410. mask = _get_mask(X.data, self.missing_values)
  411. # The imputer mask will be constructed with the same sparse format
  412. # as X.
  413. sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
  414. else sparse.csc_matrix)
  415. imputer_mask = sparse_constructor(
  416. (mask, X.indices.copy(), X.indptr.copy()),
  417. shape=X.shape, dtype=bool)
  418. missing_values_mask = imputer_mask.copy()
  419. missing_values_mask.eliminate_zeros()
  420. features_with_missing = (
  421. np.flatnonzero(np.diff(missing_values_mask.indptr))
  422. if missing_values_mask.format == 'csc'
  423. else np.unique(missing_values_mask.indices))
  424. if self.sparse is False:
  425. imputer_mask = imputer_mask.toarray()
  426. elif imputer_mask.format == 'csr':
  427. imputer_mask = imputer_mask.tocsc()
  428. else:
  429. if sparse.issparse(X):
  430. # case of sparse matrix with 0 as missing values. Implicit and
  431. # explicit zeros are considered as missing values.
  432. X = X.toarray()
  433. imputer_mask = _get_mask(X, self.missing_values)
  434. features_with_missing = np.flatnonzero(imputer_mask.sum(axis=0))
  435. if self.sparse is True:
  436. imputer_mask = sparse.csc_matrix(imputer_mask)
  437. return imputer_mask, features_with_missing
  438. def fit(self, X, y=None):
  439. """Fit the transformer on X.
  440. Parameters
  441. ----------
  442. X : {array-like, sparse matrix}, shape (n_samples, n_features)
  443. Input data, where ``n_samples`` is the number of samples and
  444. ``n_features`` is the number of features.
  445. Returns
  446. -------
  447. self : object
  448. Returns self.
  449. """
  450. if not is_scalar_nan(self.missing_values):
  451. force_all_finite = True
  452. else:
  453. force_all_finite = "allow-nan"
  454. X = check_array(X, accept_sparse=('csc', 'csr'),
  455. force_all_finite=force_all_finite)
  456. _check_inputs_dtype(X, self.missing_values)
  457. self._n_features = X.shape[1]
  458. if self.features not in ('missing-only', 'all'):
  459. raise ValueError("'features' has to be either 'missing-only' or "
  460. "'all'. Got {} instead.".format(self.features))
  461. if not ((isinstance(self.sparse, six.string_types) and
  462. self.sparse == "auto") or isinstance(self.sparse, bool)):
  463. raise ValueError("'sparse' has to be a boolean or 'auto'. "
  464. "Got {!r} instead.".format(self.sparse))
  465. self.features_ = (self._get_missing_features_info(X)[1]
  466. if self.features == 'missing-only'
  467. else np.arange(self._n_features))
  468. return self
  469. def transform(self, X):
  470. """Generate missing values indicator for X.
  471. Parameters
  472. ----------
  473. X : {array-like, sparse matrix}, shape (n_samples, n_features)
  474. The input data to complete.
  475. Returns
  476. -------
  477. Xt : {ndarray or sparse matrix}, shape (n_samples, n_features)
  478. The missing indicator for input data. The data type of ``Xt``
  479. will be boolean.
  480. """
  481. check_is_fitted(self, "features_")
  482. if not is_scalar_nan(self.missing_values):
  483. force_all_finite = True
  484. else:
  485. force_all_finite = "allow-nan"
  486. X = check_array(X, accept_sparse=('csc', 'csr'),
  487. force_all_finite=force_all_finite)
  488. _check_inputs_dtype(X, self.missing_values)
  489. if X.shape[1] != self._n_features:
  490. raise ValueError("X has a different number of features "
  491. "than during fitting.")
  492. imputer_mask, features = self._get_missing_features_info(X)
  493. if self.features == "missing-only":
  494. features_diff_fit_trans = np.setdiff1d(features, self.features_)
  495. if (self.error_on_new and features_diff_fit_trans.size > 0):
  496. raise ValueError("The features {} have missing values "
  497. "in transform but have no missing values "
  498. "in fit.".format(features_diff_fit_trans))
  499. if (self.features_.size > 0 and
  500. self.features_.size < self._n_features):
  501. imputer_mask = imputer_mask[:, self.features_]
  502. return imputer_mask
  503. def fit_transform(self, X, y=None):
  504. """Generate missing values indicator for X.
  505. Parameters
  506. ----------
  507. X : {array-like, sparse matrix}, shape (n_samples, n_features)
  508. The input data to complete.
  509. Returns
  510. -------
  511. Xt : {ndarray or sparse matrix}, shape (n_samples, n_features)
  512. The missing indicator for input data. The data type of ``Xt``
  513. will be boolean.
  514. """
  515. return self.fit(X, y).transform(X)