587 lines
21 KiB
Python
587 lines
21 KiB
Python
|
import pickle
|
||
|
from io import BytesIO
|
||
|
import numpy as np
|
||
|
import scipy.sparse
|
||
|
|
||
|
from sklearn.datasets import load_digits, load_iris
|
||
|
|
||
|
from sklearn.model_selection import train_test_split
|
||
|
from sklearn.model_selection import cross_val_score
|
||
|
|
||
|
from sklearn.externals.six.moves import zip
|
||
|
from sklearn.utils.testing import assert_almost_equal
|
||
|
from sklearn.utils.testing import assert_array_equal
|
||
|
from sklearn.utils.testing import assert_array_almost_equal
|
||
|
from sklearn.utils.testing import assert_equal
|
||
|
from sklearn.utils.testing import assert_raises
|
||
|
from sklearn.utils.testing import assert_raise_message
|
||
|
from sklearn.utils.testing import assert_greater
|
||
|
from sklearn.utils.testing import assert_warns
|
||
|
|
||
|
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
|
||
|
|
||
|
# Data is just 6 separable points in the plane
|
||
|
X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
|
||
|
y = np.array([1, 1, 1, 2, 2, 2])
|
||
|
|
||
|
# A bit more random tests
|
||
|
rng = np.random.RandomState(0)
|
||
|
X1 = rng.normal(size=(10, 3))
|
||
|
y1 = (rng.normal(size=(10)) > 0).astype(np.int)
|
||
|
|
||
|
# Data is 6 random integer points in a 100 dimensional space classified to
|
||
|
# three classes.
|
||
|
X2 = rng.randint(5, size=(6, 100))
|
||
|
y2 = np.array([1, 1, 2, 2, 3, 3])
|
||
|
|
||
|
|
||
|
def test_gnb():
|
||
|
# Gaussian Naive Bayes classification.
|
||
|
# This checks that GaussianNB implements fit and predict and returns
|
||
|
# correct values for a simple toy dataset.
|
||
|
|
||
|
clf = GaussianNB()
|
||
|
y_pred = clf.fit(X, y).predict(X)
|
||
|
assert_array_equal(y_pred, y)
|
||
|
|
||
|
y_pred_proba = clf.predict_proba(X)
|
||
|
y_pred_log_proba = clf.predict_log_proba(X)
|
||
|
assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)
|
||
|
|
||
|
# Test whether label mismatch between target y and classes raises
|
||
|
# an Error
|
||
|
# FIXME Remove this test once the more general partial_fit tests are merged
|
||
|
assert_raises(ValueError, GaussianNB().partial_fit, X, y, classes=[0, 1])
|
||
|
|
||
|
|
||
|
def test_gnb_prior():
|
||
|
# Test whether class priors are properly set.
|
||
|
clf = GaussianNB().fit(X, y)
|
||
|
assert_array_almost_equal(np.array([3, 3]) / 6.0,
|
||
|
clf.class_prior_, 8)
|
||
|
clf.fit(X1, y1)
|
||
|
# Check that the class priors sum to 1
|
||
|
assert_array_almost_equal(clf.class_prior_.sum(), 1)
|
||
|
|
||
|
|
||
|
def test_gnb_sample_weight():
|
||
|
"""Test whether sample weights are properly used in GNB. """
|
||
|
# Sample weights all being 1 should not change results
|
||
|
sw = np.ones(6)
|
||
|
clf = GaussianNB().fit(X, y)
|
||
|
clf_sw = GaussianNB().fit(X, y, sw)
|
||
|
|
||
|
assert_array_almost_equal(clf.theta_, clf_sw.theta_)
|
||
|
assert_array_almost_equal(clf.sigma_, clf_sw.sigma_)
|
||
|
|
||
|
# Fitting twice with half sample-weights should result
|
||
|
# in same result as fitting once with full weights
|
||
|
sw = rng.rand(y.shape[0])
|
||
|
clf1 = GaussianNB().fit(X, y, sample_weight=sw)
|
||
|
clf2 = GaussianNB().partial_fit(X, y, classes=[1, 2], sample_weight=sw / 2)
|
||
|
clf2.partial_fit(X, y, sample_weight=sw / 2)
|
||
|
|
||
|
assert_array_almost_equal(clf1.theta_, clf2.theta_)
|
||
|
assert_array_almost_equal(clf1.sigma_, clf2.sigma_)
|
||
|
|
||
|
# Check that duplicate entries and correspondingly increased sample
|
||
|
# weights yield the same result
|
||
|
ind = rng.randint(0, X.shape[0], 20)
|
||
|
sample_weight = np.bincount(ind, minlength=X.shape[0])
|
||
|
|
||
|
clf_dupl = GaussianNB().fit(X[ind], y[ind])
|
||
|
clf_sw = GaussianNB().fit(X, y, sample_weight)
|
||
|
|
||
|
assert_array_almost_equal(clf_dupl.theta_, clf_sw.theta_)
|
||
|
assert_array_almost_equal(clf_dupl.sigma_, clf_sw.sigma_)
|
||
|
|
||
|
|
||
|
def test_gnb_neg_priors():
|
||
|
"""Test whether an error is raised in case of negative priors"""
|
||
|
clf = GaussianNB(priors=np.array([-1., 2.]))
|
||
|
assert_raises(ValueError, clf.fit, X, y)
|
||
|
|
||
|
|
||
|
def test_gnb_priors():
|
||
|
"""Test whether the class prior override is properly used"""
|
||
|
clf = GaussianNB(priors=np.array([0.3, 0.7])).fit(X, y)
|
||
|
assert_array_almost_equal(clf.predict_proba([[-0.1, -0.1]]),
|
||
|
np.array([[0.825303662161683,
|
||
|
0.174696337838317]]), 8)
|
||
|
assert_array_equal(clf.class_prior_, np.array([0.3, 0.7]))
|
||
|
|
||
|
|
||
|
def test_gnb_wrong_nb_priors():
|
||
|
""" Test whether an error is raised if the number of prior is different
|
||
|
from the number of class"""
|
||
|
clf = GaussianNB(priors=np.array([.25, .25, .25, .25]))
|
||
|
assert_raises(ValueError, clf.fit, X, y)
|
||
|
|
||
|
|
||
|
def test_gnb_prior_greater_one():
|
||
|
"""Test if an error is raised if the sum of prior greater than one"""
|
||
|
clf = GaussianNB(priors=np.array([2., 1.]))
|
||
|
assert_raises(ValueError, clf.fit, X, y)
|
||
|
|
||
|
|
||
|
def test_gnb_prior_large_bias():
|
||
|
"""Test if good prediction when class prior favor largely one class"""
|
||
|
clf = GaussianNB(priors=np.array([0.01, 0.99]))
|
||
|
clf.fit(X, y)
|
||
|
assert_equal(clf.predict([[-0.1, -0.1]]), np.array([2]))
|
||
|
|
||
|
|
||
|
def test_check_update_with_no_data():
|
||
|
""" Test when the partial fit is called without any data"""
|
||
|
# Create an empty array
|
||
|
prev_points = 100
|
||
|
mean = 0.
|
||
|
var = 1.
|
||
|
x_empty = np.empty((0, X.shape[1]))
|
||
|
tmean, tvar = GaussianNB._update_mean_variance(prev_points, mean,
|
||
|
var, x_empty)
|
||
|
assert_equal(tmean, mean)
|
||
|
assert_equal(tvar, var)
|
||
|
|
||
|
|
||
|
def test_gnb_pfit_wrong_nb_features():
|
||
|
"""Test whether an error is raised when the number of feature changes
|
||
|
between two partial fit"""
|
||
|
clf = GaussianNB()
|
||
|
# Fit for the first time the GNB
|
||
|
clf.fit(X, y)
|
||
|
# Partial fit a second time with an incoherent X
|
||
|
assert_raises(ValueError, clf.partial_fit, np.hstack((X, X)), y)
|
||
|
|
||
|
|
||
|
def test_discrete_prior():
|
||
|
# Test whether class priors are properly set.
|
||
|
for cls in [BernoulliNB, MultinomialNB]:
|
||
|
clf = cls().fit(X2, y2)
|
||
|
assert_array_almost_equal(np.log(np.array([2, 2, 2]) / 6.0),
|
||
|
clf.class_log_prior_, 8)
|
||
|
|
||
|
|
||
|
def test_mnnb():
|
||
|
# Test Multinomial Naive Bayes classification.
|
||
|
# This checks that MultinomialNB implements fit and predict and returns
|
||
|
# correct values for a simple toy dataset.
|
||
|
|
||
|
for X in [X2, scipy.sparse.csr_matrix(X2)]:
|
||
|
# Check the ability to predict the learning set.
|
||
|
clf = MultinomialNB()
|
||
|
assert_raises(ValueError, clf.fit, -X, y2)
|
||
|
y_pred = clf.fit(X, y2).predict(X)
|
||
|
|
||
|
assert_array_equal(y_pred, y2)
|
||
|
|
||
|
# Verify that np.log(clf.predict_proba(X)) gives the same results as
|
||
|
# clf.predict_log_proba(X)
|
||
|
y_pred_proba = clf.predict_proba(X)
|
||
|
y_pred_log_proba = clf.predict_log_proba(X)
|
||
|
assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)
|
||
|
|
||
|
# Check that incremental fitting yields the same results
|
||
|
clf2 = MultinomialNB()
|
||
|
clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2))
|
||
|
clf2.partial_fit(X[2:5], y2[2:5])
|
||
|
clf2.partial_fit(X[5:], y2[5:])
|
||
|
|
||
|
y_pred2 = clf2.predict(X)
|
||
|
assert_array_equal(y_pred2, y2)
|
||
|
|
||
|
y_pred_proba2 = clf2.predict_proba(X)
|
||
|
y_pred_log_proba2 = clf2.predict_log_proba(X)
|
||
|
assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8)
|
||
|
assert_array_almost_equal(y_pred_proba2, y_pred_proba)
|
||
|
assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba)
|
||
|
|
||
|
# Partial fit on the whole data at once should be the same as fit too
|
||
|
clf3 = MultinomialNB()
|
||
|
clf3.partial_fit(X, y2, classes=np.unique(y2))
|
||
|
|
||
|
y_pred3 = clf3.predict(X)
|
||
|
assert_array_equal(y_pred3, y2)
|
||
|
y_pred_proba3 = clf3.predict_proba(X)
|
||
|
y_pred_log_proba3 = clf3.predict_log_proba(X)
|
||
|
assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8)
|
||
|
assert_array_almost_equal(y_pred_proba3, y_pred_proba)
|
||
|
assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba)
|
||
|
|
||
|
|
||
|
def check_partial_fit(cls):
|
||
|
clf1 = cls()
|
||
|
clf1.fit([[0, 1], [1, 0]], [0, 1])
|
||
|
|
||
|
clf2 = cls()
|
||
|
clf2.partial_fit([[0, 1], [1, 0]], [0, 1], classes=[0, 1])
|
||
|
assert_array_equal(clf1.class_count_, clf2.class_count_)
|
||
|
assert_array_equal(clf1.feature_count_, clf2.feature_count_)
|
||
|
|
||
|
clf3 = cls()
|
||
|
clf3.partial_fit([[0, 1]], [0], classes=[0, 1])
|
||
|
clf3.partial_fit([[1, 0]], [1])
|
||
|
assert_array_equal(clf1.class_count_, clf3.class_count_)
|
||
|
assert_array_equal(clf1.feature_count_, clf3.feature_count_)
|
||
|
|
||
|
|
||
|
def test_discretenb_partial_fit():
|
||
|
for cls in [MultinomialNB, BernoulliNB]:
|
||
|
yield check_partial_fit, cls
|
||
|
|
||
|
|
||
|
def test_gnb_partial_fit():
|
||
|
clf = GaussianNB().fit(X, y)
|
||
|
clf_pf = GaussianNB().partial_fit(X, y, np.unique(y))
|
||
|
assert_array_almost_equal(clf.theta_, clf_pf.theta_)
|
||
|
assert_array_almost_equal(clf.sigma_, clf_pf.sigma_)
|
||
|
assert_array_almost_equal(clf.class_prior_, clf_pf.class_prior_)
|
||
|
|
||
|
clf_pf2 = GaussianNB().partial_fit(X[0::2, :], y[0::2], np.unique(y))
|
||
|
clf_pf2.partial_fit(X[1::2], y[1::2])
|
||
|
assert_array_almost_equal(clf.theta_, clf_pf2.theta_)
|
||
|
assert_array_almost_equal(clf.sigma_, clf_pf2.sigma_)
|
||
|
assert_array_almost_equal(clf.class_prior_, clf_pf2.class_prior_)
|
||
|
|
||
|
|
||
|
def test_discretenb_pickle():
|
||
|
# Test picklability of discrete naive Bayes classifiers
|
||
|
|
||
|
for cls in [BernoulliNB, MultinomialNB, GaussianNB]:
|
||
|
clf = cls().fit(X2, y2)
|
||
|
y_pred = clf.predict(X2)
|
||
|
|
||
|
store = BytesIO()
|
||
|
pickle.dump(clf, store)
|
||
|
clf = pickle.load(BytesIO(store.getvalue()))
|
||
|
|
||
|
assert_array_equal(y_pred, clf.predict(X2))
|
||
|
|
||
|
if cls is not GaussianNB:
|
||
|
# TODO re-enable me when partial_fit is implemented for GaussianNB
|
||
|
|
||
|
# Test pickling of estimator trained with partial_fit
|
||
|
clf2 = cls().partial_fit(X2[:3], y2[:3], classes=np.unique(y2))
|
||
|
clf2.partial_fit(X2[3:], y2[3:])
|
||
|
store = BytesIO()
|
||
|
pickle.dump(clf2, store)
|
||
|
clf2 = pickle.load(BytesIO(store.getvalue()))
|
||
|
assert_array_equal(y_pred, clf2.predict(X2))
|
||
|
|
||
|
|
||
|
def test_input_check_fit():
|
||
|
# Test input checks for the fit method
|
||
|
for cls in [BernoulliNB, MultinomialNB, GaussianNB]:
|
||
|
# check shape consistency for number of samples at fit time
|
||
|
assert_raises(ValueError, cls().fit, X2, y2[:-1])
|
||
|
|
||
|
# check shape consistency for number of input features at predict time
|
||
|
clf = cls().fit(X2, y2)
|
||
|
assert_raises(ValueError, clf.predict, X2[:, :-1])
|
||
|
|
||
|
|
||
|
def test_input_check_partial_fit():
|
||
|
for cls in [BernoulliNB, MultinomialNB]:
|
||
|
# check shape consistency
|
||
|
assert_raises(ValueError, cls().partial_fit, X2, y2[:-1],
|
||
|
classes=np.unique(y2))
|
||
|
|
||
|
# classes is required for first call to partial fit
|
||
|
assert_raises(ValueError, cls().partial_fit, X2, y2)
|
||
|
|
||
|
# check consistency of consecutive classes values
|
||
|
clf = cls()
|
||
|
clf.partial_fit(X2, y2, classes=np.unique(y2))
|
||
|
assert_raises(ValueError, clf.partial_fit, X2, y2,
|
||
|
classes=np.arange(42))
|
||
|
|
||
|
# check consistency of input shape for partial_fit
|
||
|
assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2)
|
||
|
|
||
|
# check consistency of input shape for predict
|
||
|
assert_raises(ValueError, clf.predict, X2[:, :-1])
|
||
|
|
||
|
|
||
|
def test_discretenb_predict_proba():
|
||
|
# Test discrete NB classes' probability scores
|
||
|
|
||
|
# The 100s below distinguish Bernoulli from multinomial.
|
||
|
# FIXME: write a test to show this.
|
||
|
X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]]
|
||
|
X_multinomial = [[0, 1], [1, 3], [4, 0]]
|
||
|
|
||
|
# test binary case (1-d output)
|
||
|
y = [0, 0, 2] # 2 is regression test for binary case, 02e673
|
||
|
for cls, X in zip([BernoulliNB, MultinomialNB],
|
||
|
[X_bernoulli, X_multinomial]):
|
||
|
clf = cls().fit(X, y)
|
||
|
assert_equal(clf.predict(X[-1:]), 2)
|
||
|
assert_equal(clf.predict_proba([X[0]]).shape, (1, 2))
|
||
|
assert_array_almost_equal(clf.predict_proba(X[:2]).sum(axis=1),
|
||
|
np.array([1., 1.]), 6)
|
||
|
|
||
|
# test multiclass case (2-d output, must sum to one)
|
||
|
y = [0, 1, 2]
|
||
|
for cls, X in zip([BernoulliNB, MultinomialNB],
|
||
|
[X_bernoulli, X_multinomial]):
|
||
|
clf = cls().fit(X, y)
|
||
|
assert_equal(clf.predict_proba(X[0:1]).shape, (1, 3))
|
||
|
assert_equal(clf.predict_proba(X[:2]).shape, (2, 3))
|
||
|
assert_almost_equal(np.sum(clf.predict_proba([X[1]])), 1)
|
||
|
assert_almost_equal(np.sum(clf.predict_proba([X[-1]])), 1)
|
||
|
assert_almost_equal(np.sum(np.exp(clf.class_log_prior_)), 1)
|
||
|
assert_almost_equal(np.sum(np.exp(clf.intercept_)), 1)
|
||
|
|
||
|
|
||
|
def test_discretenb_uniform_prior():
|
||
|
# Test whether discrete NB classes fit a uniform prior
|
||
|
# when fit_prior=False and class_prior=None
|
||
|
|
||
|
for cls in [BernoulliNB, MultinomialNB]:
|
||
|
clf = cls()
|
||
|
clf.set_params(fit_prior=False)
|
||
|
clf.fit([[0], [0], [1]], [0, 0, 1])
|
||
|
prior = np.exp(clf.class_log_prior_)
|
||
|
assert_array_equal(prior, np.array([.5, .5]))
|
||
|
|
||
|
|
||
|
def test_discretenb_provide_prior():
|
||
|
# Test whether discrete NB classes use provided prior
|
||
|
|
||
|
for cls in [BernoulliNB, MultinomialNB]:
|
||
|
clf = cls(class_prior=[0.5, 0.5])
|
||
|
clf.fit([[0], [0], [1]], [0, 0, 1])
|
||
|
prior = np.exp(clf.class_log_prior_)
|
||
|
assert_array_equal(prior, np.array([.5, .5]))
|
||
|
|
||
|
# Inconsistent number of classes with prior
|
||
|
assert_raises(ValueError, clf.fit, [[0], [1], [2]], [0, 1, 2])
|
||
|
assert_raises(ValueError, clf.partial_fit, [[0], [1]], [0, 1],
|
||
|
classes=[0, 1, 1])
|
||
|
|
||
|
|
||
|
def test_discretenb_provide_prior_with_partial_fit():
|
||
|
# Test whether discrete NB classes use provided prior
|
||
|
# when using partial_fit
|
||
|
|
||
|
iris = load_iris()
|
||
|
iris_data1, iris_data2, iris_target1, iris_target2 = train_test_split(
|
||
|
iris.data, iris.target, test_size=0.4, random_state=415)
|
||
|
|
||
|
for cls in [BernoulliNB, MultinomialNB]:
|
||
|
for prior in [None, [0.3, 0.3, 0.4]]:
|
||
|
clf_full = cls(class_prior=prior)
|
||
|
clf_full.fit(iris.data, iris.target)
|
||
|
clf_partial = cls(class_prior=prior)
|
||
|
clf_partial.partial_fit(iris_data1, iris_target1,
|
||
|
classes=[0, 1, 2])
|
||
|
clf_partial.partial_fit(iris_data2, iris_target2)
|
||
|
assert_array_almost_equal(clf_full.class_log_prior_,
|
||
|
clf_partial.class_log_prior_)
|
||
|
|
||
|
|
||
|
def test_sample_weight_multiclass():
|
||
|
for cls in [BernoulliNB, MultinomialNB]:
|
||
|
# check shape consistency for number of samples at fit time
|
||
|
yield check_sample_weight_multiclass, cls
|
||
|
|
||
|
|
||
|
def check_sample_weight_multiclass(cls):
|
||
|
X = [
|
||
|
[0, 0, 1],
|
||
|
[0, 1, 1],
|
||
|
[0, 1, 1],
|
||
|
[1, 0, 0],
|
||
|
]
|
||
|
y = [0, 0, 1, 2]
|
||
|
sample_weight = np.array([1, 1, 2, 2], dtype=np.float64)
|
||
|
sample_weight /= sample_weight.sum()
|
||
|
clf = cls().fit(X, y, sample_weight=sample_weight)
|
||
|
assert_array_equal(clf.predict(X), [0, 1, 1, 2])
|
||
|
|
||
|
# Check sample weight using the partial_fit method
|
||
|
clf = cls()
|
||
|
clf.partial_fit(X[:2], y[:2], classes=[0, 1, 2],
|
||
|
sample_weight=sample_weight[:2])
|
||
|
clf.partial_fit(X[2:3], y[2:3], sample_weight=sample_weight[2:3])
|
||
|
clf.partial_fit(X[3:], y[3:], sample_weight=sample_weight[3:])
|
||
|
assert_array_equal(clf.predict(X), [0, 1, 1, 2])
|
||
|
|
||
|
|
||
|
def test_sample_weight_mnb():
|
||
|
clf = MultinomialNB()
|
||
|
clf.fit([[1, 2], [1, 2], [1, 0]],
|
||
|
[0, 0, 1],
|
||
|
sample_weight=[1, 1, 4])
|
||
|
assert_array_equal(clf.predict([[1, 0]]), [1])
|
||
|
positive_prior = np.exp(clf.intercept_[0])
|
||
|
assert_array_almost_equal([1 - positive_prior, positive_prior],
|
||
|
[1 / 3., 2 / 3.])
|
||
|
|
||
|
|
||
|
def test_coef_intercept_shape():
|
||
|
# coef_ and intercept_ should have shapes as in other linear models.
|
||
|
# Non-regression test for issue #2127.
|
||
|
X = [[1, 0, 0], [1, 1, 1]]
|
||
|
y = [1, 2] # binary classification
|
||
|
|
||
|
for clf in [MultinomialNB(), BernoulliNB()]:
|
||
|
clf.fit(X, y)
|
||
|
assert_equal(clf.coef_.shape, (1, 3))
|
||
|
assert_equal(clf.intercept_.shape, (1,))
|
||
|
|
||
|
|
||
|
def test_check_accuracy_on_digits():
|
||
|
# Non regression test to make sure that any further refactoring / optim
|
||
|
# of the NB models do not harm the performance on a slightly non-linearly
|
||
|
# separable dataset
|
||
|
digits = load_digits()
|
||
|
X, y = digits.data, digits.target
|
||
|
binary_3v8 = np.logical_or(digits.target == 3, digits.target == 8)
|
||
|
X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8]
|
||
|
|
||
|
# Multinomial NB
|
||
|
scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10)
|
||
|
assert_greater(scores.mean(), 0.86)
|
||
|
|
||
|
scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10)
|
||
|
assert_greater(scores.mean(), 0.94)
|
||
|
|
||
|
# Bernoulli NB
|
||
|
scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10)
|
||
|
assert_greater(scores.mean(), 0.83)
|
||
|
|
||
|
scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10)
|
||
|
assert_greater(scores.mean(), 0.92)
|
||
|
|
||
|
# Gaussian NB
|
||
|
scores = cross_val_score(GaussianNB(), X, y, cv=10)
|
||
|
assert_greater(scores.mean(), 0.77)
|
||
|
|
||
|
scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
|
||
|
assert_greater(scores.mean(), 0.86)
|
||
|
|
||
|
|
||
|
def test_feature_log_prob_bnb():
|
||
|
# Test for issue #4268.
|
||
|
# Tests that the feature log prob value computed by BernoulliNB when
|
||
|
# alpha=1.0 is equal to the expression given in Manning, Raghavan,
|
||
|
# and Schuetze's "Introduction to Information Retrieval" book:
|
||
|
# http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
|
||
|
|
||
|
X = np.array([[0, 0, 0], [1, 1, 0], [0, 1, 0], [1, 0, 1], [0, 1, 0]])
|
||
|
Y = np.array([0, 0, 1, 2, 2])
|
||
|
|
||
|
# Fit Bernoulli NB w/ alpha = 1.0
|
||
|
clf = BernoulliNB(alpha=1.0)
|
||
|
clf.fit(X, Y)
|
||
|
|
||
|
# Manually form the (log) numerator and denominator that
|
||
|
# constitute P(feature presence | class)
|
||
|
num = np.log(clf.feature_count_ + 1.0)
|
||
|
denom = np.tile(np.log(clf.class_count_ + 2.0), (X.shape[1], 1)).T
|
||
|
|
||
|
# Check manual estimate matches
|
||
|
assert_array_almost_equal(clf.feature_log_prob_, (num - denom))
|
||
|
|
||
|
|
||
|
def test_bnb():
|
||
|
# Tests that BernoulliNB when alpha=1.0 gives the same values as
|
||
|
# those given for the toy example in Manning, Raghavan, and
|
||
|
# Schuetze's "Introduction to Information Retrieval" book:
|
||
|
# http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html
|
||
|
|
||
|
# Training data points are:
|
||
|
# Chinese Beijing Chinese (class: China)
|
||
|
# Chinese Chinese Shanghai (class: China)
|
||
|
# Chinese Macao (class: China)
|
||
|
# Tokyo Japan Chinese (class: Japan)
|
||
|
|
||
|
# Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo
|
||
|
X = np.array([[1, 1, 0, 0, 0, 0],
|
||
|
[0, 1, 0, 0, 1, 0],
|
||
|
[0, 1, 0, 1, 0, 0],
|
||
|
[0, 1, 1, 0, 0, 1]])
|
||
|
|
||
|
# Classes are China (0), Japan (1)
|
||
|
Y = np.array([0, 0, 0, 1])
|
||
|
|
||
|
# Fit BernoulliBN w/ alpha = 1.0
|
||
|
clf = BernoulliNB(alpha=1.0)
|
||
|
clf.fit(X, Y)
|
||
|
|
||
|
# Check the class prior is correct
|
||
|
class_prior = np.array([0.75, 0.25])
|
||
|
assert_array_almost_equal(np.exp(clf.class_log_prior_), class_prior)
|
||
|
|
||
|
# Check the feature probabilities are correct
|
||
|
feature_prob = np.array([[0.4, 0.8, 0.2, 0.4, 0.4, 0.2],
|
||
|
[1/3.0, 2/3.0, 2/3.0, 1/3.0, 1/3.0, 2/3.0]])
|
||
|
assert_array_almost_equal(np.exp(clf.feature_log_prob_), feature_prob)
|
||
|
|
||
|
# Testing data point is:
|
||
|
# Chinese Chinese Chinese Tokyo Japan
|
||
|
X_test = np.array([[0, 1, 1, 0, 0, 1]])
|
||
|
|
||
|
# Check the predictive probabilities are correct
|
||
|
unnorm_predict_proba = np.array([[0.005183999999999999,
|
||
|
0.02194787379972565]])
|
||
|
predict_proba = unnorm_predict_proba / np.sum(unnorm_predict_proba)
|
||
|
assert_array_almost_equal(clf.predict_proba(X_test), predict_proba)
|
||
|
|
||
|
|
||
|
def test_naive_bayes_scale_invariance():
|
||
|
# Scaling the data should not change the prediction results
|
||
|
iris = load_iris()
|
||
|
X, y = iris.data, iris.target
|
||
|
labels = [GaussianNB().fit(f * X, y).predict(f * X)
|
||
|
for f in [1E-10, 1, 1E10]]
|
||
|
assert_array_equal(labels[0], labels[1])
|
||
|
assert_array_equal(labels[1], labels[2])
|
||
|
|
||
|
|
||
|
def test_alpha():
|
||
|
# Setting alpha=0 should not output nan results when p(x_i|y_j)=0 is a case
|
||
|
X = np.array([[1, 0], [1, 1]])
|
||
|
y = np.array([0, 1])
|
||
|
nb = BernoulliNB(alpha=0.)
|
||
|
assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1])
|
||
|
assert_warns(UserWarning, nb.fit, X, y)
|
||
|
prob = np.array([[1, 0], [0, 1]])
|
||
|
assert_array_almost_equal(nb.predict_proba(X), prob)
|
||
|
|
||
|
nb = MultinomialNB(alpha=0.)
|
||
|
assert_warns(UserWarning, nb.partial_fit, X, y, classes=[0, 1])
|
||
|
assert_warns(UserWarning, nb.fit, X, y)
|
||
|
prob = np.array([[2./3, 1./3], [0, 1]])
|
||
|
assert_array_almost_equal(nb.predict_proba(X), prob)
|
||
|
|
||
|
# Test sparse X
|
||
|
X = scipy.sparse.csr_matrix(X)
|
||
|
nb = BernoulliNB(alpha=0.)
|
||
|
assert_warns(UserWarning, nb.fit, X, y)
|
||
|
prob = np.array([[1, 0], [0, 1]])
|
||
|
assert_array_almost_equal(nb.predict_proba(X), prob)
|
||
|
|
||
|
nb = MultinomialNB(alpha=0.)
|
||
|
assert_warns(UserWarning, nb.fit, X, y)
|
||
|
prob = np.array([[2./3, 1./3], [0, 1]])
|
||
|
assert_array_almost_equal(nb.predict_proba(X), prob)
|
||
|
|
||
|
# Test for alpha < 0
|
||
|
X = np.array([[1, 0], [1, 1]])
|
||
|
y = np.array([0, 1])
|
||
|
expected_msg = ('Smoothing parameter alpha = -1.0e-01. '
|
||
|
'alpha should be > 0.')
|
||
|
b_nb = BernoulliNB(alpha=-0.1)
|
||
|
m_nb = MultinomialNB(alpha=-0.1)
|
||
|
assert_raise_message(ValueError, expected_msg, b_nb.fit, X, y)
|
||
|
assert_raise_message(ValueError, expected_msg, m_nb.fit, X, y)
|
||
|
|
||
|
b_nb = BernoulliNB(alpha=-0.1)
|
||
|
m_nb = MultinomialNB(alpha=-0.1)
|
||
|
assert_raise_message(ValueError, expected_msg, b_nb.partial_fit,
|
||
|
X, y, classes=[0, 1])
|
||
|
assert_raise_message(ValueError, expected_msg, m_nb.partial_fit,
|
||
|
X, y, classes=[0, 1])
|