"""
|
|
Soft Voting/Majority Rule classifier.
|
|
|
|
This module contains a Soft Voting/Majority Rule classifier for
|
|
classification estimators.
|
|
|
|
"""
|
|
|
|
# Authors: Sebastian Raschka <se.raschka@gmail.com>,
|
|
# Gilles Louppe <g.louppe@gmail.com>
|
|
#
|
|
# License: BSD 3 clause
|
|
|
|
import numpy as np
import warnings

from ..base import ClassifierMixin
from ..base import TransformerMixin
from ..base import clone
from ..preprocessing import LabelEncoder
from ..externals.joblib import Parallel, delayed
from ..utils.validation import has_fit_parameter, check_is_fitted
from ..utils.metaestimators import _BaseComposition


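# Helper executed inside each joblib job: ``VotingClassifier.fit`` clones every
# non-None estimator and dispatches the clones here via ``Parallel``/``delayed``.
# ``sample_weight`` is forwarded only when it is provided, because not every
# estimator's ``fit`` accepts that argument.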
def _parallel_fit_estimator(estimator, X, y, sample_weight=None):
    """Private function used to fit an estimator within a job."""
    if sample_weight is not None:
        estimator.fit(X, y, sample_weight=sample_weight)
    else:
        estimator.fit(X, y)
    return estimator


class VotingClassifier(_BaseComposition, ClassifierMixin, TransformerMixin):
    """Soft Voting/Majority Rule classifier for unfitted estimators.

    .. versionadded:: 0.17

    Read more in the :ref:`User Guide <voting_classifier>`.

    Parameters
    ----------
    estimators : list of (string, estimator) tuples
        Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
        of those original estimators that will be stored in the class attribute
        ``self.estimators_``. An estimator can be set to `None` using
        ``set_params``.

    voting : str, {'hard', 'soft'} (default='hard')
        If 'hard', uses predicted class labels for majority rule voting.
        Else if 'soft', predicts the class label based on the argmax of
        the sums of the predicted probabilities, which is recommended for
        an ensemble of well-calibrated classifiers.

    weights : array-like, shape = [n_classifiers], optional (default=`None`)
        Sequence of weights (`float` or `int`) to weight the occurrences of
        predicted class labels (`hard` voting) or class probabilities
        before averaging (`soft` voting). Uses uniform weights if `None`.

    n_jobs : int, optional (default=1)
        The number of jobs to run in parallel for ``fit``.
        If -1, then the number of jobs is set to the number of cores.

    flatten_transform : bool, optional (default=None)
        Affects the shape of the transform output only when voting='soft'.
        If voting='soft' and flatten_transform=True, the transform method
        returns a matrix with shape (n_samples, n_classifiers * n_classes).
        If flatten_transform=False, it returns
        (n_classifiers, n_samples, n_classes).

    Attributes
    ----------
    estimators_ : list of classifiers
        The collection of fitted sub-estimators as defined in ``estimators``
        that are not `None`.

    classes_ : array-like, shape = [n_predictions]
        The class labels.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.naive_bayes import GaussianNB
    >>> from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    >>> clf1 = LogisticRegression(random_state=1)
    >>> clf2 = RandomForestClassifier(random_state=1)
    >>> clf3 = GaussianNB()
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> eclf1 = VotingClassifier(estimators=[
    ...         ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
    >>> eclf1 = eclf1.fit(X, y)
    >>> print(eclf1.predict(X))
    [1 1 1 2 2 2]
    >>> eclf2 = VotingClassifier(estimators=[
    ...         ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    ...         voting='soft')
    >>> eclf2 = eclf2.fit(X, y)
    >>> print(eclf2.predict(X))
    [1 1 1 2 2 2]
    >>> eclf3 = VotingClassifier(estimators=[
    ...        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    ...        voting='soft', weights=[2, 1, 1],
    ...        flatten_transform=True)
    >>> eclf3 = eclf3.fit(X, y)
    >>> print(eclf3.predict(X))
    [1 1 1 2 2 2]
    >>> print(eclf3.transform(X).shape)
    (6, 6)
    >>>
    """

    def __init__(self, estimators, voting='hard', weights=None, n_jobs=1,
                 flatten_transform=None):
        self.estimators = estimators
        self.voting = voting
        self.weights = weights
        self.n_jobs = n_jobs
        self.flatten_transform = flatten_transform

    @property
    def named_estimators(self):
        return dict(self.estimators)

    def fit(self, X, y, sample_weight=None):
        """ Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.

        Returns
        -------
        self : object
        """
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'
                                      ' classification is not supported.')

        if self.voting not in ('soft', 'hard'):
            raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                             % self.voting)

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')

        if (self.weights is not None and
                len(self.weights) != len(self.estimators)):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d estimators'
                             % (len(self.weights), len(self.estimators)))

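        # Fail fast if sample weights were passed but some underlying
        # estimator's ``fit`` cannot accept them.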
        if sample_weight is not None:
            for name, step in self.estimators:
                if not has_fit_parameter(step, 'sample_weight'):
                    raise ValueError('Underlying estimator \'%s\' does not'
                                     ' support sample weights.' % name)
        names, clfs = zip(*self.estimators)
        self._validate_names(names)

        n_isnone = np.sum([clf is None for _, clf in self.estimators])
        if n_isnone == len(self.estimators):
            raise ValueError('All estimators are None. At least one is '
                             'required to be a classifier!')

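        # Encode class labels as integers 0..n_classes-1; hard voting relies on
        # this so that ``np.bincount`` can tally integer votes in ``predict``.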
        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        self.estimators_ = []

        transformed_y = self.le_.transform(y)

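        # Fit a clone of every non-None estimator on the encoded targets,
        # one joblib job per estimator (see ``_parallel_fit_estimator`` above).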
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y,
                                                 sample_weight=sample_weight)
                for clf in clfs if clf is not None)

        return self

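    # Note: ``self.weights`` is given per entry in ``estimators``; the helper
    # below drops the weights of entries set to None so that the remaining
    # weights stay aligned with the fitted ``estimators_``.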
    @property
    def _weights_not_none(self):
        """Get the weights of estimators that are not `None`."""
        if self.weights is None:
            return None
        return [w for est, w in zip(self.estimators,
                                    self.weights) if est[1] is not None]

    def predict(self, X):
        """ Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        maj : array-like, shape = [n_samples]
            Predicted class labels.
        """

        check_is_fitted(self, 'estimators_')
        if self.voting == 'soft':
            maj = np.argmax(self.predict_proba(X), axis=1)

        else:  # 'hard' voting
            predictions = self._predict(X)
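            # For each sample, tally the (optionally weighted) votes per class
            # and pick the class with the highest count.  Illustration: with
            # weights=[2, 1, 1] and one sample's votes [0, 1, 1], np.bincount
            # yields [2., 2.] and np.argmax picks class 0 (ties resolve to the
            # lowest class index).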
            maj = np.apply_along_axis(
                lambda x: np.argmax(
                    np.bincount(x, weights=self._weights_not_none)),
                axis=1, arr=predictions)

        maj = self.le_.inverse_transform(maj)

        return maj

    def _collect_probas(self, X):
        """Collect results from clf.predict_proba calls."""
        return np.asarray([clf.predict_proba(X) for clf in self.estimators_])

    def _predict_proba(self, X):
        """Predict class probabilities for X in 'soft' voting."""
        if self.voting == 'hard':
            raise AttributeError("predict_proba is not available when"
                                 " voting=%r" % self.voting)
        check_is_fitted(self, 'estimators_')
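        # ``_collect_probas`` returns shape (n_classifiers, n_samples, n_classes);
        # the (optionally weighted) mean over axis 0 gives one probability
        # matrix of shape (n_samples, n_classes).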
        avg = np.average(self._collect_probas(X), axis=0,
                         weights=self._weights_not_none)
        return avg

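    # Exposed as a property that returns the bound ``_predict_proba`` method;
    # calling the returned method raises AttributeError when voting='hard'.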
    @property
    def predict_proba(self):
        """Compute probabilities of possible outcomes for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        avg : array-like, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.
        """
        return self._predict_proba

    def transform(self, X):
        """Return class labels or probabilities for X for each estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        If `voting='soft'` and `flatten_transform=True`:
          array-like = (n_samples, n_classifiers * n_classes)
          otherwise array-like = (n_classifiers, n_samples, n_classes)
          Class probabilities calculated by each classifier.
        If `voting='hard'`:
          array-like = [n_samples, n_classifiers]
          Class labels predicted by each classifier.
        """
        check_is_fitted(self, 'estimators_')

        if self.voting == 'soft':
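            # probas has shape (n_classifiers, n_samples, n_classes);
            # np.hstack below concatenates along axis 1, flattening it to
            # (n_samples, n_classifiers * n_classes) when flatten_transform=True.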
            probas = self._collect_probas(X)
            if self.flatten_transform is None:
                warnings.warn("'flatten_transform' default value will be "
                              "changed to True in 0.21. To silence this "
                              "warning you may explicitly set "
                              "flatten_transform=False.",
                              DeprecationWarning)
                return probas
            elif not self.flatten_transform:
                return probas
            else:
                return np.hstack(probas)

        else:
            return self._predict(X)

    def set_params(self, **params):
        """ Set the parameters of the voting classifier.

        Valid parameter keys can be listed with get_params().

        Parameters
        ----------
        params : keyword arguments
            Specific parameters using e.g. set_params(parameter_name=new_value).
            In addition to setting the parameters of the ``VotingClassifier``,
            the individual classifiers of the ``VotingClassifier`` can also be
            set or replaced by setting them to None.

        Examples
        --------
        # In this example, the RandomForestClassifier is removed
        clf1 = LogisticRegression()
        clf2 = RandomForestClassifier()
        eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)])
        eclf.set_params(rf=None)

        """
        super(VotingClassifier, self)._set_params('estimators', **params)
        return self

    def get_params(self, deep=True):
        """ Get the parameters of the VotingClassifier.

        Parameters
        ----------
        deep : bool
            Setting it to True gets the various classifiers and the parameters
            of the classifiers as well.
        """
        return super(VotingClassifier,
                     self)._get_params('estimators', deep=deep)

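    # Stacks each classifier's label predictions column-wise, giving an array
    # of shape (n_samples, n_classifiers) for hard voting and ``transform``.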
    def _predict(self, X):
        """Collect results from clf.predict calls."""
        return np.asarray([clf.predict(X) for clf in self.estimators_]).T