"""Testing for the VotingClassifier"""

import numpy as np
from sklearn.utils.testing import assert_almost_equal, assert_array_equal
from sklearn.utils.testing import assert_equal, assert_true, assert_false
from sklearn.utils.testing import assert_raise_message
from sklearn.utils.testing import assert_warns_message
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import datasets
from sklearn.datasets import make_multilabel_classification
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, ClassifierMixin


# Load the iris dataset, keeping only two of its features.
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target


def test_estimator_init():
    eclf = VotingClassifier(estimators=[])
    msg = ('Invalid `estimators` attribute, `estimators` should be'
           ' a list of (string, estimator) tuples')
    assert_raise_message(AttributeError, msg, eclf.fit, X, y)

    clf = LogisticRegression(random_state=1)

    eclf = VotingClassifier(estimators=[('lr', clf)], voting='error')
    msg = "Voting must be 'soft' or 'hard'; got (voting='error')"
    assert_raise_message(ValueError, msg, eclf.fit, X, y)

    eclf = VotingClassifier(estimators=[('lr', clf)], weights=[1, 2])
    msg = ('Number of classifiers and weights must be equal'
           '; got 2 weights, 1 estimators')
    assert_raise_message(ValueError, msg, eclf.fit, X, y)

    eclf = VotingClassifier(estimators=[('lr', clf), ('lr', clf)],
                            weights=[1, 2])
    msg = "Names provided are not unique: ['lr', 'lr']"
    assert_raise_message(ValueError, msg, eclf.fit, X, y)

    eclf = VotingClassifier(estimators=[('lr__', clf)])
    msg = "Estimator names must not contain __: got ['lr__']"
    assert_raise_message(ValueError, msg, eclf.fit, X, y)

    eclf = VotingClassifier(estimators=[('estimators', clf)])
    msg = "Estimator names conflict with constructor arguments: ['estimators']"
    assert_raise_message(ValueError, msg, eclf.fit, X, y)


def test_predictproba_hardvoting():
    eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
                                        ('lr2', LogisticRegression())],
                            voting='hard')
    msg = "predict_proba is not available when voting='hard'"
    assert_raise_message(AttributeError, msg, eclf.predict_proba, X)


def test_notfitted():
    eclf = VotingClassifier(estimators=[('lr1', LogisticRegression()),
                                        ('lr2', LogisticRegression())],
                            voting='soft')
    msg = ("This VotingClassifier instance is not fitted yet. Call 'fit'"
           " with appropriate arguments before using this method.")
    assert_raise_message(NotFittedError, msg, eclf.predict_proba, X)


def test_majority_label_iris():
    """Check classification by majority label on dataset iris."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='hard')
    scores = cross_val_score(eclf, X, y, cv=5, scoring='accuracy')
    assert_almost_equal(scores.mean(), 0.95, decimal=2)


def test_tie_situation():
    """Check voting classifier selects smaller class label in tie situation."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)],
                            voting='hard')
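    # Sample 73 gets one vote for each class; with equal weights, hard
    # voting breaks the tie in favour of the smaller class label.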
    assert_equal(clf1.fit(X, y).predict(X)[73], 2)
    assert_equal(clf2.fit(X, y).predict(X)[73], 1)
    assert_equal(eclf.fit(X, y).predict(X)[73], 1)


def test_weights_iris():
    """Check classification by average probabilities on dataset iris."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        weights=[1, 2, 10])
    scores = cross_val_score(eclf, X, y, cv=5, scoring='accuracy')
    assert_almost_equal(scores.mean(), 0.93, decimal=2)


def test_predict_on_toy_problem():
    """Manually check predicted class labels for toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()

    X = np.array([[-1.1, -1.5],
                  [-1.2, -1.4],
                  [-3.4, -2.2],
                  [1.1, 1.2],
                  [2.1, 1.4],
                  [3.1, 2.3]])

    y = np.array([1, 1, 1, 2, 2, 2])

    # Compare the arrays element-wise: the original
    # `assert_equal(all(...), all(...))` reduced both sides to a single
    # boolean and passed vacuously.
    assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
    assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
    assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])

    eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='hard',
        weights=[1, 1, 1])
    assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])

    eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        weights=[1, 1, 1])
    assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])


def test_predict_proba_on_toy_problem():
    """Calculate predicted probabilities on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    clf1_res = np.array([[0.59790391, 0.40209609],
                         [0.57622162, 0.42377838],
                         [0.50728456, 0.49271544],
                         [0.40241774, 0.59758226]])

    clf2_res = np.array([[0.8, 0.2],
                         [0.8, 0.2],
                         [0.2, 0.8],
                         [0.3, 0.7]])

    clf3_res = np.array([[0.9985082, 0.0014918],
                         [0.99845843, 0.00154157],
                         [0., 1.],
                         [0., 1.]])
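
    # Expected ensemble probabilities: the weighted average of the three
    # classifiers' probabilities with weights [2, 1, 1] (total weight 4).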
    t00 = (2 * clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4
    t11 = (2 * clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4
    t21 = (2 * clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4
    t31 = (2 * clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4

    eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        weights=[2, 1, 1])
    eclf_res = eclf.fit(X, y).predict_proba(X)

    assert_almost_equal(t00, eclf_res[0][0], decimal=1)
    assert_almost_equal(t11, eclf_res[1][1], decimal=1)
    assert_almost_equal(t21, eclf_res[2][1], decimal=1)
    assert_almost_equal(t31, eclf_res[3][1], decimal=1)

    try:
        eclf = VotingClassifier(estimators=[
            ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
            voting='hard')
        eclf.fit(X, y).predict_proba(X)
    except AttributeError:
        pass
    else:
        raise AssertionError('AttributeError for voting == "hard"'
                             ' and with predict_proba not raised')


def test_multilabel():
    """Check if error is raised for multilabel classification."""
    X, y = make_multilabel_classification(n_classes=2, n_labels=1,
                                          allow_unlabeled=False,
                                          random_state=123)
    clf = OneVsRestClassifier(SVC(kernel='linear'))

    eclf = VotingClassifier(estimators=[('ovr', clf)], voting='hard')

    try:
        eclf.fit(X, y)
    except NotImplementedError:
        return


def test_gridsearch():
    """Check GridSearch support."""
    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft')
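
    # Parameters of the underlying estimators are searchable with the
    # <name>__<param> syntax, alongside the ensemble's own parameters.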
    params = {'lr__C': [1.0, 100.0],
              'voting': ['soft', 'hard'],
              'weights': [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]]}

    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
    grid.fit(iris.data, iris.target)


def test_parallel_fit():
    """Check parallel backend of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        n_jobs=1).fit(X, y)
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        n_jobs=2).fit(X, y)

    # Fitting in parallel must not change the results.
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))


def test_sample_weight():
    """Tests sample_weight parameter of VotingClassifier"""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = SVC(probability=True, random_state=123)
    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y, sample_weight=np.ones((len(y),)))
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3)],
        voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
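
    # A one-estimator ensemble fitted with sample weights should behave
    # exactly like the underlying estimator fitted with the same weights.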
    sample_weight = np.random.RandomState(123).uniform(size=(len(y),))
    eclf3 = VotingClassifier(estimators=[('lr', clf1)], voting='soft')
    eclf3.fit(X, y, sample_weight)
    clf1.fit(X, y, sample_weight)
    assert_array_equal(eclf3.predict(X), clf1.predict(X))
    assert_array_equal(eclf3.predict_proba(X), clf1.predict_proba(X))

    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('svc', clf3), ('knn', clf4)],
        voting='soft')
    msg = "Underlying estimator 'knn' does not support sample weights."
    assert_raise_message(ValueError, msg, eclf3.fit, X, y, sample_weight)


def test_sample_weight_kwargs():
    """Check that VotingClassifier passes sample_weight as kwargs"""
    class MockClassifier(BaseEstimator, ClassifierMixin):
        """Mock Classifier to check that sample_weight is received as kwargs"""
        def fit(self, X, y, *args, **sample_weight):
            assert_true('sample_weight' in sample_weight)

    clf = MockClassifier()
    eclf = VotingClassifier(estimators=[('mock', clf)], voting='soft')

    # Should not raise an error.
    eclf.fit(X, y, sample_weight=np.ones((len(y),)))


def test_set_params():
    """set_params should be able to set estimators"""
    clf1 = LogisticRegression(random_state=123, C=1.0)
    clf2 = RandomForestClassifier(random_state=123, max_depth=None)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier([('lr', clf1), ('rf', clf2)], voting='soft',
                             weights=[1, 2])
    eclf1.fit(X, y)
    eclf2 = VotingClassifier([('lr', clf1), ('nb', clf3)], voting='soft',
                             weights=[1, 2])
    eclf2.set_params(nb=clf2).fit(X, y)
    assert_false(hasattr(eclf2, 'nb'))

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    assert_equal(eclf2.estimators[0][1].get_params(), clf1.get_params())
    assert_equal(eclf2.estimators[1][1].get_params(), clf2.get_params())

    eclf1.set_params(lr__C=10.0)
    eclf2.set_params(nb__max_depth=5)

    assert_true(eclf1.estimators[0][1].get_params()['C'] == 10.0)
    assert_true(eclf2.estimators[1][1].get_params()['max_depth'] == 5)
    assert_equal(eclf1.get_params()["lr__C"],
                 eclf1.get_params()["lr"].get_params()['C'])


def test_set_estimator_none():
    """VotingClassifier set_params should be able to set estimators as None"""
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 0, 0.5]).fit(X, y)

    eclf2 = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                         ('nb', clf3)],
                             voting='hard', weights=[1, 1, 0.5])
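    # Dropping the forest from eclf2 (rf=None) should give the same
    # predictions as eclf1, where the forest's vote carries zero weight.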
    eclf2.set_params(rf=None).fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert_true(dict(eclf2.estimators)["rf"] is None)
    assert_true(len(eclf2.estimators_) == 2)
    assert_true(all([not isinstance(est, RandomForestClassifier) for est in
                     eclf2.estimators_]))
    assert_true(eclf2.get_params()["rf"] is None)

    eclf1.set_params(voting='soft').fit(X, y)
    eclf2.set_params(voting='soft').fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))
    msg = ('All estimators are None. At least one is required'
           ' to be a classifier!')
    assert_raise_message(
        ValueError, msg, eclf2.set_params(lr=None, rf=None, nb=None).fit, X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[0, 0.5]).fit(X1, y1)

    eclf2 = VotingClassifier(estimators=[('rf', clf2), ('nb', clf3)],
                             voting='soft', weights=[1, 0.5])
    eclf2.set_params(rf=None).fit(X1, y1)
    assert_array_equal(eclf1.transform(X1), np.array([[[0.7, 0.3], [0.3, 0.7]],
                                                      [[1., 0.], [0., 1.]]]))
    assert_array_equal(eclf2.transform(X1), np.array([[[1., 0.], [0., 1.]]]))
    eclf1.set_params(voting='hard')
    eclf2.set_params(voting='hard')
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))


def test_estimator_weights_format():
    # Test estimator weights inputs as list and array
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2)],
        weights=[1, 2],
        voting='soft')
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2)],
        weights=np.array((1, 2)),
        voting='soft')
    eclf1.fit(X, y)
    eclf2.fit(X, y)
    assert_array_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))


def test_transform():
    """Check transform method of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft').fit(X, y)
    eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        flatten_transform=True).fit(X, y)
    eclf3 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft',
        flatten_transform=False).fit(X, y)
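
    # eclf1 leaves flatten_transform unset, so transform() should warn about
    # the default changing in 0.21; the un-flattened output has shape
    # (n_classifiers, n_samples, n_classes) = (3, 4, 2), the flattened (4, 6).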
    # The expected text mirrors the message emitted by VotingClassifier,
    # spacing quirks included; do not "fix" it here.
    warn_msg = ("'flatten_transform' default value will be "
                "changed to True in 0.21."
                "To silence this warning you may"
                " explicitly set flatten_transform=False.")
    res = assert_warns_message(DeprecationWarning, warn_msg,
                               eclf1.transform, X)
    assert_array_equal(res.shape, (3, 4, 2))
    assert_array_equal(eclf2.transform(X).shape, (4, 6))
    assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
    assert_array_equal(res.swapaxes(0, 1).reshape((4, 6)),
                       eclf2.transform(X))
    assert_array_equal(eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)),
                       eclf2.transform(X))