# -*- coding: utf-8 -*-
|
|
"""Generic feature selection mixin"""
|
|
|
|
# Authors: G. Varoquaux, A. Gramfort, L. Buitinck, J. Nothman
|
|
# License: BSD 3 clause
|
|
|
|
from abc import ABCMeta, abstractmethod
|
|
from warnings import warn
|
|
|
|
import numpy as np
|
|
from scipy.sparse import issparse, csc_matrix
|
|
|
|
from ..base import TransformerMixin
|
|
from ..utils import check_array, safe_mask
|
|
from ..externals import six
|
|
|
|
|
|
class SelectorMixin(six.with_metaclass(ABCMeta, TransformerMixin)):
    """
    Transformer mixin that performs feature selection given a support mask

    This mixin provides a feature selector implementation with `transform` and
    `inverse_transform` functionality given an implementation of
    `_get_support_mask`.

    Subclasses only have to implement `_get_support_mask`; `get_support`,
    `transform` and `inverse_transform` are all derived from that mask.
    """

    def get_support(self, indices=False):
        """
        Get a mask, or integer index, of the features selected

        Parameters
        ----------
        indices : boolean (default False)
            If True, the return value will be an array of integers, rather
            than a boolean mask.

        Returns
        -------
        support : array
            An index that selects the retained features from a feature vector.
            If `indices` is False, this is a boolean array of shape
            [# input features], in which an element is True iff its
            corresponding feature is selected for retention. If `indices` is
            True, this is an integer array of shape [# output features] whose
            values are indices into the input feature vector.
        """
        mask = self._get_support_mask()
        # np.where on a 1-d mask returns a 1-tuple; [0] extracts the indices.
        return mask if not indices else np.where(mask)[0]

    @abstractmethod
    def _get_support_mask(self):
        """
        Get the boolean mask indicating which features are selected

        Returns
        -------
        support : boolean array of shape [# input features]
            An element is True iff its corresponding feature is selected for
            retention.
        """

    def transform(self, X):
        """Reduce X to the selected features.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples. The input is validated with `check_array`,
            which is asked to accept CSR sparse input.

        Returns
        -------
        X_r : array of shape [n_samples, n_selected_features]
            The input samples with only the selected features. If no feature
            is selected, an empty array of shape [n_samples, 0] with the same
            dtype as the validated input is returned.

        Raises
        ------
        ValueError
            If at least one feature is selected and the number of columns of
            `X` differs from the length of the support mask.

        Warns
        -----
        UserWarning
            If the support mask selects no feature at all.
        """
        X = check_array(X, accept_sparse='csr')
        mask = self.get_support()
        if not mask.any():
            warn("No features were selected: either the data is"
                 " too noisy or the selection test too strict.",
                 UserWarning)
            # Keep the dtype of the validated input so the empty result is
            # consistent with what the non-empty path below would return
            # (np.empty defaults to float64 otherwise).
            return np.empty(0, dtype=X.dtype).reshape((X.shape[0], 0))
        if len(mask) != X.shape[1]:
            raise ValueError("X has a different shape than during fitting.")
        return X[:, safe_mask(X, mask)]

    def inverse_transform(self, X):
        """
        Reverse the transformation operation

        Parameters
        ----------
        X : array of shape [n_samples, n_selected_features]
            The input samples. Sparse input is converted to CSC and handled
            without densifying.

        Returns
        -------
        X_r : array of shape [n_samples, n_original_features]
            `X` with columns of zeros inserted where features would have
            been removed by `transform`. A CSC matrix is returned for sparse
            input, a dense array otherwise.

        Raises
        ------
        ValueError
            If the number of columns of `X` differs from the number of
            selected features.
        """
        if issparse(X):
            X = X.tocsc()
            # insert additional entries in indptr:
            # e.g. if transform changed indptr from [0 2 6 7] to [0 2 3]
            # col_nonzeros here will be [2 0 1] so indptr becomes [0 2 2 3]
            # The per-column non-zero counts are expanded by the dense branch
            # below (one recursive call on a 1 x n_selected row), which also
            # performs the shape validation for the sparse case.
            it = self.inverse_transform(np.diff(X.indptr).reshape(1, -1))
            col_nonzeros = it.ravel()
            indptr = np.concatenate([[0], np.cumsum(col_nonzeros)])
            Xt = csc_matrix((X.data, X.indices, indptr),
                            shape=(X.shape[0], len(indptr) - 1), dtype=X.dtype)
            return Xt

        support = self.get_support()
        X = check_array(X)
        # NOTE: X.shape[1] requires a 2-d array, so a 1-d X can never get past
        # this check; the former `X.ndim == 1` promotion that followed it was
        # unreachable and has been dropped.
        if support.sum() != X.shape[1]:
            raise ValueError("X has a different shape than during fitting.")

        Xt = np.zeros((X.shape[0], support.size), dtype=X.dtype)
        Xt[:, support] = X
        return Xt
|