1028 lines
42 KiB
Python
1028 lines
42 KiB
Python
from __future__ import division, print_function
|
|
|
|
import numpy as np
|
|
from itertools import product
|
|
import warnings
|
|
from scipy.sparse import csr_matrix
|
|
|
|
from sklearn import datasets
|
|
from sklearn import svm
|
|
|
|
from sklearn.datasets import make_multilabel_classification
|
|
from sklearn.random_projection import sparse_random_matrix
|
|
from sklearn.utils.validation import check_array, check_consistent_length
|
|
from sklearn.utils.validation import check_random_state
|
|
|
|
from sklearn.utils.testing import assert_raises, clean_warning_registry
|
|
from sklearn.utils.testing import assert_raise_message
|
|
from sklearn.utils.testing import assert_equal
|
|
from sklearn.utils.testing import assert_almost_equal
|
|
from sklearn.utils.testing import assert_array_equal
|
|
from sklearn.utils.testing import assert_array_almost_equal
|
|
from sklearn.utils.testing import assert_warns
|
|
|
|
from sklearn.metrics import auc
|
|
from sklearn.metrics import average_precision_score
|
|
from sklearn.metrics import coverage_error
|
|
from sklearn.metrics import label_ranking_average_precision_score
|
|
from sklearn.metrics import precision_recall_curve
|
|
from sklearn.metrics import label_ranking_loss
|
|
from sklearn.metrics import roc_auc_score
|
|
from sklearn.metrics import roc_curve
|
|
|
|
from sklearn.exceptions import UndefinedMetricWarning
|
|
|
|
|
|
###############################################################################
|
|
# Utilities for testing
|
|
|
|
def make_prediction(dataset=None, binary=False):
|
|
"""Make some classification predictions on a toy dataset using a SVC
|
|
|
|
If binary is True restrict to a binary classification problem instead of a
|
|
multiclass classification problem
|
|
"""
|
|
|
|
if dataset is None:
|
|
# import some data to play with
|
|
dataset = datasets.load_iris()
|
|
|
|
X = dataset.data
|
|
y = dataset.target
|
|
|
|
if binary:
|
|
# restrict to a binary classification task
|
|
X, y = X[y < 2], y[y < 2]
|
|
|
|
n_samples, n_features = X.shape
|
|
p = np.arange(n_samples)
|
|
|
|
rng = check_random_state(37)
|
|
rng.shuffle(p)
|
|
X, y = X[p], y[p]
|
|
half = int(n_samples / 2)
|
|
|
|
# add noisy features to make the problem harder and avoid perfect results
|
|
rng = np.random.RandomState(0)
|
|
X = np.c_[X, rng.randn(n_samples, 200 * n_features)]
|
|
|
|
# run classifier, get class probabilities and label predictions
|
|
clf = svm.SVC(kernel='linear', probability=True, random_state=0)
|
|
probas_pred = clf.fit(X[:half], y[:half]).predict_proba(X[half:])
|
|
|
|
if binary:
|
|
# only interested in probabilities of the positive case
|
|
# XXX: do we really want a special API for the binary case?
|
|
probas_pred = probas_pred[:, 1]
|
|
|
|
y_pred = clf.predict(X[half:])
|
|
y_true = y[half:]
|
|
return y_true, y_pred, probas_pred
|
|
|
|
|
|
###############################################################################
|
|
# Tests
|
|
|
|
def _auc(y_true, y_score):
|
|
"""Alternative implementation to check for correctness of
|
|
`roc_auc_score`."""
|
|
pos_label = np.unique(y_true)[1]
|
|
|
|
# Count the number of times positive samples are correctly ranked above
|
|
# negative samples.
|
|
pos = y_score[y_true == pos_label]
|
|
neg = y_score[y_true != pos_label]
|
|
diff_matrix = pos.reshape(1, -1) - neg.reshape(-1, 1)
|
|
n_correct = np.sum(diff_matrix > 0)
|
|
|
|
return n_correct / float(len(pos) * len(neg))
|
|
|
|
|
|
def _average_precision(y_true, y_score):
|
|
"""Alternative implementation to check for correctness of
|
|
`average_precision_score`.
|
|
|
|
Note that this implementation fails on some edge cases.
|
|
For example, for constant predictions e.g. [0.5, 0.5, 0.5],
|
|
y_true = [1, 0, 0] returns an average precision of 0.33...
|
|
but y_true = [0, 0, 1] returns 1.0.
|
|
"""
|
|
pos_label = np.unique(y_true)[1]
|
|
n_pos = np.sum(y_true == pos_label)
|
|
order = np.argsort(y_score)[::-1]
|
|
y_score = y_score[order]
|
|
y_true = y_true[order]
|
|
|
|
score = 0
|
|
for i in range(len(y_score)):
|
|
if y_true[i] == pos_label:
|
|
# Compute precision up to document i
|
|
# i.e, percentage of relevant documents up to document i.
|
|
prec = 0
|
|
for j in range(0, i + 1):
|
|
if y_true[j] == pos_label:
|
|
prec += 1.0
|
|
prec /= (i + 1.0)
|
|
score += prec
|
|
|
|
return score / n_pos
|
|
|
|
|
|
def _average_precision_slow(y_true, y_score):
|
|
"""A second alternative implementation of average precision that closely
|
|
follows the Wikipedia article's definition (see References). This should
|
|
give identical results as `average_precision_score` for all inputs.
|
|
|
|
References
|
|
----------
|
|
.. [1] `Wikipedia entry for the Average precision
|
|
<http://en.wikipedia.org/wiki/Average_precision>`_
|
|
"""
|
|
precision, recall, threshold = precision_recall_curve(y_true, y_score)
|
|
precision = list(reversed(precision))
|
|
recall = list(reversed(recall))
|
|
average_precision = 0
|
|
for i in range(1, len(precision)):
|
|
average_precision += precision[i] * (recall[i] - recall[i - 1])
|
|
return average_precision
|
|
|
|
|
|
def test_roc_curve():
|
|
# Test Area under Receiver Operating Characteristic (ROC) curve
|
|
y_true, _, probas_pred = make_prediction(binary=True)
|
|
expected_auc = _auc(y_true, probas_pred)
|
|
|
|
for drop in [True, False]:
|
|
fpr, tpr, thresholds = roc_curve(y_true, probas_pred,
|
|
drop_intermediate=drop)
|
|
roc_auc = auc(fpr, tpr)
|
|
assert_array_almost_equal(roc_auc, expected_auc, decimal=2)
|
|
assert_almost_equal(roc_auc, roc_auc_score(y_true, probas_pred))
|
|
assert_equal(fpr.shape, tpr.shape)
|
|
assert_equal(fpr.shape, thresholds.shape)
|
|
|
|
|
|
def test_roc_curve_end_points():
|
|
# Make sure that roc_curve returns a curve start at 0 and ending and
|
|
# 1 even in corner cases
|
|
rng = np.random.RandomState(0)
|
|
y_true = np.array([0] * 50 + [1] * 50)
|
|
y_pred = rng.randint(3, size=100)
|
|
fpr, tpr, thr = roc_curve(y_true, y_pred, drop_intermediate=True)
|
|
assert_equal(fpr[0], 0)
|
|
assert_equal(fpr[-1], 1)
|
|
assert_equal(fpr.shape, tpr.shape)
|
|
assert_equal(fpr.shape, thr.shape)
|
|
|
|
|
|
def test_roc_returns_consistency():
|
|
# Test whether the returned threshold matches up with tpr
|
|
# make small toy dataset
|
|
y_true, _, probas_pred = make_prediction(binary=True)
|
|
fpr, tpr, thresholds = roc_curve(y_true, probas_pred)
|
|
|
|
# use the given thresholds to determine the tpr
|
|
tpr_correct = []
|
|
for t in thresholds:
|
|
tp = np.sum((probas_pred >= t) & y_true)
|
|
p = np.sum(y_true)
|
|
tpr_correct.append(1.0 * tp / p)
|
|
|
|
# compare tpr and tpr_correct to see if the thresholds' order was correct
|
|
assert_array_almost_equal(tpr, tpr_correct, decimal=2)
|
|
assert_equal(fpr.shape, tpr.shape)
|
|
assert_equal(fpr.shape, thresholds.shape)
|
|
|
|
|
|
def test_roc_curve_multi():
|
|
# roc_curve not applicable for multi-class problems
|
|
y_true, _, probas_pred = make_prediction(binary=False)
|
|
|
|
assert_raises(ValueError, roc_curve, y_true, probas_pred)
|
|
|
|
|
|
def test_roc_curve_confidence():
|
|
# roc_curve for confidence scores
|
|
y_true, _, probas_pred = make_prediction(binary=True)
|
|
|
|
fpr, tpr, thresholds = roc_curve(y_true, probas_pred - 0.5)
|
|
roc_auc = auc(fpr, tpr)
|
|
assert_array_almost_equal(roc_auc, 0.90, decimal=2)
|
|
assert_equal(fpr.shape, tpr.shape)
|
|
assert_equal(fpr.shape, thresholds.shape)
|
|
|
|
|
|
def test_roc_curve_hard():
|
|
# roc_curve for hard decisions
|
|
y_true, pred, probas_pred = make_prediction(binary=True)
|
|
|
|
# always predict one
|
|
trivial_pred = np.ones(y_true.shape)
|
|
fpr, tpr, thresholds = roc_curve(y_true, trivial_pred)
|
|
roc_auc = auc(fpr, tpr)
|
|
assert_array_almost_equal(roc_auc, 0.50, decimal=2)
|
|
assert_equal(fpr.shape, tpr.shape)
|
|
assert_equal(fpr.shape, thresholds.shape)
|
|
|
|
# always predict zero
|
|
trivial_pred = np.zeros(y_true.shape)
|
|
fpr, tpr, thresholds = roc_curve(y_true, trivial_pred)
|
|
roc_auc = auc(fpr, tpr)
|
|
assert_array_almost_equal(roc_auc, 0.50, decimal=2)
|
|
assert_equal(fpr.shape, tpr.shape)
|
|
assert_equal(fpr.shape, thresholds.shape)
|
|
|
|
# hard decisions
|
|
fpr, tpr, thresholds = roc_curve(y_true, pred)
|
|
roc_auc = auc(fpr, tpr)
|
|
assert_array_almost_equal(roc_auc, 0.78, decimal=2)
|
|
assert_equal(fpr.shape, tpr.shape)
|
|
assert_equal(fpr.shape, thresholds.shape)
|
|
|
|
|
|
def test_roc_curve_one_label():
|
|
y_true = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
|
|
y_pred = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
# assert there are warnings
|
|
w = UndefinedMetricWarning
|
|
fpr, tpr, thresholds = assert_warns(w, roc_curve, y_true, y_pred)
|
|
# all true labels, all fpr should be nan
|
|
assert_array_equal(fpr,
|
|
np.nan * np.ones(len(thresholds)))
|
|
assert_equal(fpr.shape, tpr.shape)
|
|
assert_equal(fpr.shape, thresholds.shape)
|
|
|
|
# assert there are warnings
|
|
fpr, tpr, thresholds = assert_warns(w, roc_curve,
|
|
[1 - x for x in y_true],
|
|
y_pred)
|
|
# all negative labels, all tpr should be nan
|
|
assert_array_equal(tpr,
|
|
np.nan * np.ones(len(thresholds)))
|
|
assert_equal(fpr.shape, tpr.shape)
|
|
assert_equal(fpr.shape, thresholds.shape)
|
|
|
|
|
|
def test_roc_curve_toydata():
|
|
# Binary classification
|
|
y_true = [0, 1]
|
|
y_score = [0, 1]
|
|
tpr, fpr, _ = roc_curve(y_true, y_score)
|
|
roc_auc = roc_auc_score(y_true, y_score)
|
|
assert_array_almost_equal(tpr, [0, 1])
|
|
assert_array_almost_equal(fpr, [1, 1])
|
|
assert_almost_equal(roc_auc, 1.)
|
|
|
|
y_true = [0, 1]
|
|
y_score = [1, 0]
|
|
tpr, fpr, _ = roc_curve(y_true, y_score)
|
|
roc_auc = roc_auc_score(y_true, y_score)
|
|
assert_array_almost_equal(tpr, [0, 1, 1])
|
|
assert_array_almost_equal(fpr, [0, 0, 1])
|
|
assert_almost_equal(roc_auc, 0.)
|
|
|
|
y_true = [1, 0]
|
|
y_score = [1, 1]
|
|
tpr, fpr, _ = roc_curve(y_true, y_score)
|
|
roc_auc = roc_auc_score(y_true, y_score)
|
|
assert_array_almost_equal(tpr, [0, 1])
|
|
assert_array_almost_equal(fpr, [0, 1])
|
|
assert_almost_equal(roc_auc, 0.5)
|
|
|
|
y_true = [1, 0]
|
|
y_score = [1, 0]
|
|
tpr, fpr, _ = roc_curve(y_true, y_score)
|
|
roc_auc = roc_auc_score(y_true, y_score)
|
|
assert_array_almost_equal(tpr, [0, 1])
|
|
assert_array_almost_equal(fpr, [1, 1])
|
|
assert_almost_equal(roc_auc, 1.)
|
|
|
|
y_true = [1, 0]
|
|
y_score = [0.5, 0.5]
|
|
tpr, fpr, _ = roc_curve(y_true, y_score)
|
|
roc_auc = roc_auc_score(y_true, y_score)
|
|
assert_array_almost_equal(tpr, [0, 1])
|
|
assert_array_almost_equal(fpr, [0, 1])
|
|
assert_almost_equal(roc_auc, .5)
|
|
|
|
y_true = [0, 0]
|
|
y_score = [0.25, 0.75]
|
|
# assert UndefinedMetricWarning because of no positive sample in y_true
|
|
tpr, fpr, _ = assert_warns(UndefinedMetricWarning, roc_curve, y_true, y_score)
|
|
assert_raises(ValueError, roc_auc_score, y_true, y_score)
|
|
assert_array_almost_equal(tpr, [0., 0.5, 1.])
|
|
assert_array_almost_equal(fpr, [np.nan, np.nan, np.nan])
|
|
|
|
y_true = [1, 1]
|
|
y_score = [0.25, 0.75]
|
|
# assert UndefinedMetricWarning because of no negative sample in y_true
|
|
tpr, fpr, _ = assert_warns(UndefinedMetricWarning, roc_curve, y_true, y_score)
|
|
assert_raises(ValueError, roc_auc_score, y_true, y_score)
|
|
assert_array_almost_equal(tpr, [np.nan, np.nan])
|
|
assert_array_almost_equal(fpr, [0.5, 1.])
|
|
|
|
# Multi-label classification task
|
|
y_true = np.array([[0, 1], [0, 1]])
|
|
y_score = np.array([[0, 1], [0, 1]])
|
|
assert_raises(ValueError, roc_auc_score, y_true, y_score, average="macro")
|
|
assert_raises(ValueError, roc_auc_score, y_true, y_score,
|
|
average="weighted")
|
|
assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 1.)
|
|
assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 1.)
|
|
|
|
y_true = np.array([[0, 1], [0, 1]])
|
|
y_score = np.array([[0, 1], [1, 0]])
|
|
assert_raises(ValueError, roc_auc_score, y_true, y_score, average="macro")
|
|
assert_raises(ValueError, roc_auc_score, y_true, y_score,
|
|
average="weighted")
|
|
assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5)
|
|
assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5)
|
|
|
|
y_true = np.array([[1, 0], [0, 1]])
|
|
y_score = np.array([[0, 1], [1, 0]])
|
|
assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0)
|
|
assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0)
|
|
assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0)
|
|
assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0)
|
|
|
|
y_true = np.array([[1, 0], [0, 1]])
|
|
y_score = np.array([[0.5, 0.5], [0.5, 0.5]])
|
|
assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), .5)
|
|
assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), .5)
|
|
assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), .5)
|
|
assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), .5)
|
|
|
|
|
|
def test_roc_curve_drop_intermediate():
|
|
# Test that drop_intermediate drops the correct thresholds
|
|
y_true = [0, 0, 0, 0, 1, 1]
|
|
y_score = [0., 0.2, 0.5, 0.6, 0.7, 1.0]
|
|
tpr, fpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=True)
|
|
assert_array_almost_equal(thresholds, [1., 0.7, 0.])
|
|
|
|
# Test dropping thresholds with repeating scores
|
|
y_true = [0, 0, 0, 0, 0, 0, 0,
|
|
1, 1, 1, 1, 1, 1]
|
|
y_score = [0., 0.1, 0.6, 0.6, 0.7, 0.8, 0.9,
|
|
0.6, 0.7, 0.8, 0.9, 0.9, 1.0]
|
|
tpr, fpr, thresholds = roc_curve(y_true, y_score, drop_intermediate=True)
|
|
assert_array_almost_equal(thresholds,
|
|
[1.0, 0.9, 0.7, 0.6, 0.])
|
|
|
|
|
|
def test_auc():
|
|
# Test Area Under Curve (AUC) computation
|
|
x = [0, 1]
|
|
y = [0, 1]
|
|
assert_array_almost_equal(auc(x, y), 0.5)
|
|
x = [1, 0]
|
|
y = [0, 1]
|
|
assert_array_almost_equal(auc(x, y), 0.5)
|
|
x = [1, 0, 0]
|
|
y = [0, 1, 1]
|
|
assert_array_almost_equal(auc(x, y), 0.5)
|
|
x = [0, 1]
|
|
y = [1, 1]
|
|
assert_array_almost_equal(auc(x, y), 1)
|
|
x = [0, 0.5, 1]
|
|
y = [0, 0.5, 1]
|
|
assert_array_almost_equal(auc(x, y), 0.5)
|
|
|
|
|
|
def test_auc_duplicate_values():
|
|
# Test Area Under Curve (AUC) computation with duplicate values
|
|
|
|
# auc() was previously sorting the x and y arrays according to the indices
|
|
# from numpy.argsort(x), which was reordering the tied 0's in this example
|
|
# and resulting in an incorrect area computation. This test detects the
|
|
# error.
|
|
x = [-2.0, 0.0, 0.0, 0.0, 1.0]
|
|
y1 = [2.0, 0.0, 0.5, 1.0, 1.0]
|
|
y2 = [2.0, 1.0, 0.0, 0.5, 1.0]
|
|
y3 = [2.0, 1.0, 0.5, 0.0, 1.0]
|
|
|
|
for y in (y1, y2, y3):
|
|
assert_array_almost_equal(auc(x, y, reorder=True), 3.0)
|
|
|
|
|
|
def test_auc_errors():
|
|
# Incompatible shapes
|
|
assert_raises(ValueError, auc, [0.0, 0.5, 1.0], [0.1, 0.2])
|
|
|
|
# Too few x values
|
|
assert_raises(ValueError, auc, [0.0], [0.1])
|
|
|
|
# x is not in order
|
|
assert_raises(ValueError, auc, [1.0, 0.0, 0.5], [0.0, 0.0, 0.0])
|
|
|
|
|
|
def test_auc_score_non_binary_class():
|
|
# Test that roc_auc_score function returns an error when trying
|
|
# to compute AUC for non-binary class values.
|
|
rng = check_random_state(404)
|
|
y_pred = rng.rand(10)
|
|
# y_true contains only one class value
|
|
y_true = np.zeros(10, dtype="int")
|
|
assert_raise_message(ValueError, "ROC AUC score is not defined",
|
|
roc_auc_score, y_true, y_pred)
|
|
y_true = np.ones(10, dtype="int")
|
|
assert_raise_message(ValueError, "ROC AUC score is not defined",
|
|
roc_auc_score, y_true, y_pred)
|
|
y_true = -np.ones(10, dtype="int")
|
|
assert_raise_message(ValueError, "ROC AUC score is not defined",
|
|
roc_auc_score, y_true, y_pred)
|
|
# y_true contains three different class values
|
|
y_true = rng.randint(0, 3, size=10)
|
|
assert_raise_message(ValueError, "multiclass format is not supported",
|
|
roc_auc_score, y_true, y_pred)
|
|
|
|
clean_warning_registry()
|
|
with warnings.catch_warnings(record=True):
|
|
rng = check_random_state(404)
|
|
y_pred = rng.rand(10)
|
|
# y_true contains only one class value
|
|
y_true = np.zeros(10, dtype="int")
|
|
assert_raise_message(ValueError, "ROC AUC score is not defined",
|
|
roc_auc_score, y_true, y_pred)
|
|
y_true = np.ones(10, dtype="int")
|
|
assert_raise_message(ValueError, "ROC AUC score is not defined",
|
|
roc_auc_score, y_true, y_pred)
|
|
y_true = -np.ones(10, dtype="int")
|
|
assert_raise_message(ValueError, "ROC AUC score is not defined",
|
|
roc_auc_score, y_true, y_pred)
|
|
|
|
# y_true contains three different class values
|
|
y_true = rng.randint(0, 3, size=10)
|
|
assert_raise_message(ValueError, "multiclass format is not supported",
|
|
roc_auc_score, y_true, y_pred)
|
|
|
|
|
|
def test_binary_clf_curve():
|
|
rng = check_random_state(404)
|
|
y_true = rng.randint(0, 3, size=10)
|
|
y_pred = rng.rand(10)
|
|
msg = "multiclass format is not supported"
|
|
assert_raise_message(ValueError, msg, precision_recall_curve,
|
|
y_true, y_pred)
|
|
|
|
def test_precision_recall_curve():
|
|
y_true, _, probas_pred = make_prediction(binary=True)
|
|
_test_precision_recall_curve(y_true, probas_pred)
|
|
|
|
# Use {-1, 1} for labels; make sure original labels aren't modified
|
|
y_true[np.where(y_true == 0)] = -1
|
|
y_true_copy = y_true.copy()
|
|
_test_precision_recall_curve(y_true, probas_pred)
|
|
assert_array_equal(y_true_copy, y_true)
|
|
|
|
labels = [1, 0, 0, 1]
|
|
predict_probas = [1, 2, 3, 4]
|
|
p, r, t = precision_recall_curve(labels, predict_probas)
|
|
assert_array_almost_equal(p, np.array([0.5, 0.33333333, 0.5, 1., 1.]))
|
|
assert_array_almost_equal(r, np.array([1., 0.5, 0.5, 0.5, 0.]))
|
|
assert_array_almost_equal(t, np.array([1, 2, 3, 4]))
|
|
assert_equal(p.size, r.size)
|
|
assert_equal(p.size, t.size + 1)
|
|
|
|
|
|
def test_precision_recall_curve_pos_label():
|
|
y_true, _, probas_pred = make_prediction(binary=False)
|
|
pos_label = 2
|
|
p, r, thresholds = precision_recall_curve(y_true,
|
|
probas_pred[:, pos_label],
|
|
pos_label=pos_label)
|
|
p2, r2, thresholds2 = precision_recall_curve(y_true == pos_label,
|
|
probas_pred[:, pos_label])
|
|
assert_array_almost_equal(p, p2)
|
|
assert_array_almost_equal(r, r2)
|
|
assert_array_almost_equal(thresholds, thresholds2)
|
|
assert_equal(p.size, r.size)
|
|
assert_equal(p.size, thresholds.size + 1)
|
|
|
|
|
|
def _test_precision_recall_curve(y_true, probas_pred):
|
|
# Test Precision-Recall and aread under PR curve
|
|
p, r, thresholds = precision_recall_curve(y_true, probas_pred)
|
|
precision_recall_auc = _average_precision_slow(y_true, probas_pred)
|
|
assert_array_almost_equal(precision_recall_auc, 0.859, 3)
|
|
assert_array_almost_equal(precision_recall_auc,
|
|
average_precision_score(y_true, probas_pred))
|
|
assert_almost_equal(_average_precision(y_true, probas_pred),
|
|
precision_recall_auc, decimal=3)
|
|
assert_equal(p.size, r.size)
|
|
assert_equal(p.size, thresholds.size + 1)
|
|
# Smoke test in the case of proba having only one value
|
|
p, r, thresholds = precision_recall_curve(y_true,
|
|
np.zeros_like(probas_pred))
|
|
assert_equal(p.size, r.size)
|
|
assert_equal(p.size, thresholds.size + 1)
|
|
|
|
|
|
def test_precision_recall_curve_errors():
|
|
# Contains non-binary labels
|
|
assert_raises(ValueError, precision_recall_curve,
|
|
[0, 1, 2], [[0.0], [1.0], [1.0]])
|
|
|
|
|
|
def test_precision_recall_curve_toydata():
|
|
with np.errstate(all="raise"):
|
|
# Binary classification
|
|
y_true = [0, 1]
|
|
y_score = [0, 1]
|
|
p, r, _ = precision_recall_curve(y_true, y_score)
|
|
auc_prc = average_precision_score(y_true, y_score)
|
|
assert_array_almost_equal(p, [1, 1])
|
|
assert_array_almost_equal(r, [1, 0])
|
|
assert_almost_equal(auc_prc, 1.)
|
|
|
|
y_true = [0, 1]
|
|
y_score = [1, 0]
|
|
p, r, _ = precision_recall_curve(y_true, y_score)
|
|
auc_prc = average_precision_score(y_true, y_score)
|
|
assert_array_almost_equal(p, [0.5, 0., 1.])
|
|
assert_array_almost_equal(r, [1., 0., 0.])
|
|
# Here we are doing a terrible prediction: we are always getting
|
|
# it wrong, hence the average_precision_score is the accuracy at
|
|
# chance: 50%
|
|
assert_almost_equal(auc_prc, 0.5)
|
|
|
|
y_true = [1, 0]
|
|
y_score = [1, 1]
|
|
p, r, _ = precision_recall_curve(y_true, y_score)
|
|
auc_prc = average_precision_score(y_true, y_score)
|
|
assert_array_almost_equal(p, [0.5, 1])
|
|
assert_array_almost_equal(r, [1., 0])
|
|
assert_almost_equal(auc_prc, .5)
|
|
|
|
y_true = [1, 0]
|
|
y_score = [1, 0]
|
|
p, r, _ = precision_recall_curve(y_true, y_score)
|
|
auc_prc = average_precision_score(y_true, y_score)
|
|
assert_array_almost_equal(p, [1, 1])
|
|
assert_array_almost_equal(r, [1, 0])
|
|
assert_almost_equal(auc_prc, 1.)
|
|
|
|
y_true = [1, 0]
|
|
y_score = [0.5, 0.5]
|
|
p, r, _ = precision_recall_curve(y_true, y_score)
|
|
auc_prc = average_precision_score(y_true, y_score)
|
|
assert_array_almost_equal(p, [0.5, 1])
|
|
assert_array_almost_equal(r, [1, 0.])
|
|
assert_almost_equal(auc_prc, .5)
|
|
|
|
y_true = [0, 0]
|
|
y_score = [0.25, 0.75]
|
|
assert_raises(Exception, precision_recall_curve, y_true, y_score)
|
|
assert_raises(Exception, average_precision_score, y_true, y_score)
|
|
|
|
y_true = [1, 1]
|
|
y_score = [0.25, 0.75]
|
|
p, r, _ = precision_recall_curve(y_true, y_score)
|
|
assert_almost_equal(average_precision_score(y_true, y_score), 1.)
|
|
assert_array_almost_equal(p, [1., 1., 1.])
|
|
assert_array_almost_equal(r, [1, 0.5, 0.])
|
|
|
|
# Multi-label classification task
|
|
y_true = np.array([[0, 1], [0, 1]])
|
|
y_score = np.array([[0, 1], [0, 1]])
|
|
assert_raises(Exception, average_precision_score, y_true, y_score,
|
|
average="macro")
|
|
assert_raises(Exception, average_precision_score, y_true, y_score,
|
|
average="weighted")
|
|
assert_almost_equal(average_precision_score(y_true, y_score,
|
|
average="samples"), 1.)
|
|
assert_almost_equal(average_precision_score(y_true, y_score,
|
|
average="micro"), 1.)
|
|
|
|
y_true = np.array([[0, 1], [0, 1]])
|
|
y_score = np.array([[0, 1], [1, 0]])
|
|
assert_raises(Exception, average_precision_score, y_true, y_score,
|
|
average="macro")
|
|
assert_raises(Exception, average_precision_score, y_true, y_score,
|
|
average="weighted")
|
|
assert_almost_equal(average_precision_score(y_true, y_score,
|
|
average="samples"), 0.75)
|
|
assert_almost_equal(average_precision_score(y_true, y_score,
|
|
average="micro"), 0.5)
|
|
|
|
y_true = np.array([[1, 0], [0, 1]])
|
|
y_score = np.array([[0, 1], [1, 0]])
|
|
assert_almost_equal(average_precision_score(y_true, y_score,
|
|
average="macro"), 0.5)
|
|
assert_almost_equal(average_precision_score(y_true, y_score,
|
|
average="weighted"), 0.5)
|
|
assert_almost_equal(average_precision_score(y_true, y_score,
|
|
average="samples"), 0.5)
|
|
assert_almost_equal(average_precision_score(y_true, y_score,
|
|
average="micro"), 0.5)
|
|
|
|
y_true = np.array([[1, 0], [0, 1]])
|
|
y_score = np.array([[0.5, 0.5], [0.5, 0.5]])
|
|
assert_almost_equal(average_precision_score(y_true, y_score,
|
|
average="macro"), 0.5)
|
|
assert_almost_equal(average_precision_score(y_true, y_score,
|
|
average="weighted"), 0.5)
|
|
assert_almost_equal(average_precision_score(y_true, y_score,
|
|
average="samples"), 0.5)
|
|
assert_almost_equal(average_precision_score(y_true, y_score,
|
|
average="micro"), 0.5)
|
|
|
|
|
|
def test_average_precision_constant_values():
|
|
# Check the average_precision_score of a constant predictor is
|
|
# the TPR
|
|
|
|
# Generate a dataset with 25% of positives
|
|
y_true = np.zeros(100, dtype=int)
|
|
y_true[::4] = 1
|
|
# And a constant score
|
|
y_score = np.ones(100)
|
|
# The precision is then the fraction of positive whatever the recall
|
|
# is, as there is only one threshold:
|
|
assert_equal(average_precision_score(y_true, y_score), .25)
|
|
|
|
|
|
def test_score_scale_invariance():
|
|
# Test that average_precision_score and roc_auc_score are invariant by
|
|
# the scaling or shifting of probabilities
|
|
# This test was expanded (added scaled_down) in response to github
|
|
# issue #3864 (and others), where overly aggressive rounding was causing
|
|
# problems for users with very small y_score values
|
|
y_true, _, probas_pred = make_prediction(binary=True)
|
|
|
|
roc_auc = roc_auc_score(y_true, probas_pred)
|
|
roc_auc_scaled_up = roc_auc_score(y_true, 100 * probas_pred)
|
|
roc_auc_scaled_down = roc_auc_score(y_true, 1e-6 * probas_pred)
|
|
roc_auc_shifted = roc_auc_score(y_true, probas_pred - 10)
|
|
assert_equal(roc_auc, roc_auc_scaled_up)
|
|
assert_equal(roc_auc, roc_auc_scaled_down)
|
|
assert_equal(roc_auc, roc_auc_shifted)
|
|
|
|
pr_auc = average_precision_score(y_true, probas_pred)
|
|
pr_auc_scaled_up = average_precision_score(y_true, 100 * probas_pred)
|
|
pr_auc_scaled_down = average_precision_score(y_true, 1e-6 * probas_pred)
|
|
pr_auc_shifted = average_precision_score(y_true, probas_pred - 10)
|
|
assert_equal(pr_auc, pr_auc_scaled_up)
|
|
assert_equal(pr_auc, pr_auc_scaled_down)
|
|
assert_equal(pr_auc, pr_auc_shifted)
|
|
|
|
|
|
def check_lrap_toy(lrap_score):
|
|
# Check on several small example that it works
|
|
assert_almost_equal(lrap_score([[0, 1]], [[0.25, 0.75]]), 1)
|
|
assert_almost_equal(lrap_score([[0, 1]], [[0.75, 0.25]]), 1 / 2)
|
|
assert_almost_equal(lrap_score([[1, 1]], [[0.75, 0.25]]), 1)
|
|
|
|
assert_almost_equal(lrap_score([[0, 0, 1]], [[0.25, 0.5, 0.75]]), 1)
|
|
assert_almost_equal(lrap_score([[0, 1, 0]], [[0.25, 0.5, 0.75]]), 1 / 2)
|
|
assert_almost_equal(lrap_score([[0, 1, 1]], [[0.25, 0.5, 0.75]]), 1)
|
|
assert_almost_equal(lrap_score([[1, 0, 0]], [[0.25, 0.5, 0.75]]), 1 / 3)
|
|
assert_almost_equal(lrap_score([[1, 0, 1]], [[0.25, 0.5, 0.75]]),
|
|
(2 / 3 + 1 / 1) / 2)
|
|
assert_almost_equal(lrap_score([[1, 1, 0]], [[0.25, 0.5, 0.75]]),
|
|
(2 / 3 + 1 / 2) / 2)
|
|
|
|
assert_almost_equal(lrap_score([[0, 0, 1]], [[0.75, 0.5, 0.25]]), 1 / 3)
|
|
assert_almost_equal(lrap_score([[0, 1, 0]], [[0.75, 0.5, 0.25]]), 1 / 2)
|
|
assert_almost_equal(lrap_score([[0, 1, 1]], [[0.75, 0.5, 0.25]]),
|
|
(1 / 2 + 2 / 3) / 2)
|
|
assert_almost_equal(lrap_score([[1, 0, 0]], [[0.75, 0.5, 0.25]]), 1)
|
|
assert_almost_equal(lrap_score([[1, 0, 1]], [[0.75, 0.5, 0.25]]),
|
|
(1 + 2 / 3) / 2)
|
|
assert_almost_equal(lrap_score([[1, 1, 0]], [[0.75, 0.5, 0.25]]), 1)
|
|
assert_almost_equal(lrap_score([[1, 1, 1]], [[0.75, 0.5, 0.25]]), 1)
|
|
|
|
assert_almost_equal(lrap_score([[0, 0, 1]], [[0.5, 0.75, 0.25]]), 1 / 3)
|
|
assert_almost_equal(lrap_score([[0, 1, 0]], [[0.5, 0.75, 0.25]]), 1)
|
|
assert_almost_equal(lrap_score([[0, 1, 1]], [[0.5, 0.75, 0.25]]),
|
|
(1 + 2 / 3) / 2)
|
|
assert_almost_equal(lrap_score([[1, 0, 0]], [[0.5, 0.75, 0.25]]), 1 / 2)
|
|
assert_almost_equal(lrap_score([[1, 0, 1]], [[0.5, 0.75, 0.25]]),
|
|
(1 / 2 + 2 / 3) / 2)
|
|
assert_almost_equal(lrap_score([[1, 1, 0]], [[0.5, 0.75, 0.25]]), 1)
|
|
assert_almost_equal(lrap_score([[1, 1, 1]], [[0.5, 0.75, 0.25]]), 1)
|
|
|
|
# Tie handling
|
|
assert_almost_equal(lrap_score([[1, 0]], [[0.5, 0.5]]), 0.5)
|
|
assert_almost_equal(lrap_score([[0, 1]], [[0.5, 0.5]]), 0.5)
|
|
assert_almost_equal(lrap_score([[1, 1]], [[0.5, 0.5]]), 1)
|
|
|
|
assert_almost_equal(lrap_score([[0, 0, 1]], [[0.25, 0.5, 0.5]]), 0.5)
|
|
assert_almost_equal(lrap_score([[0, 1, 0]], [[0.25, 0.5, 0.5]]), 0.5)
|
|
assert_almost_equal(lrap_score([[0, 1, 1]], [[0.25, 0.5, 0.5]]), 1)
|
|
assert_almost_equal(lrap_score([[1, 0, 0]], [[0.25, 0.5, 0.5]]), 1 / 3)
|
|
assert_almost_equal(lrap_score([[1, 0, 1]], [[0.25, 0.5, 0.5]]),
|
|
(2 / 3 + 1 / 2) / 2)
|
|
assert_almost_equal(lrap_score([[1, 1, 0]], [[0.25, 0.5, 0.5]]),
|
|
(2 / 3 + 1 / 2) / 2)
|
|
assert_almost_equal(lrap_score([[1, 1, 1]], [[0.25, 0.5, 0.5]]), 1)
|
|
|
|
assert_almost_equal(lrap_score([[1, 1, 0]], [[0.5, 0.5, 0.5]]), 2 / 3)
|
|
|
|
assert_almost_equal(lrap_score([[1, 1, 1, 0]], [[0.5, 0.5, 0.5, 0.5]]),
|
|
3 / 4)
|
|
|
|
|
|
def check_zero_or_all_relevant_labels(lrap_score):
|
|
random_state = check_random_state(0)
|
|
|
|
for n_labels in range(2, 5):
|
|
y_score = random_state.uniform(size=(1, n_labels))
|
|
y_score_ties = np.zeros_like(y_score)
|
|
|
|
# No relevant labels
|
|
y_true = np.zeros((1, n_labels))
|
|
assert_equal(lrap_score(y_true, y_score), 1.)
|
|
assert_equal(lrap_score(y_true, y_score_ties), 1.)
|
|
|
|
# Only relevant labels
|
|
y_true = np.ones((1, n_labels))
|
|
assert_equal(lrap_score(y_true, y_score), 1.)
|
|
assert_equal(lrap_score(y_true, y_score_ties), 1.)
|
|
|
|
# Degenerate case: only one label
|
|
assert_almost_equal(lrap_score([[1], [0], [1], [0]],
|
|
[[0.5], [0.5], [0.5], [0.5]]), 1.)
|
|
|
|
|
|
def check_lrap_error_raised(lrap_score):
|
|
# Raise value error if not appropriate format
|
|
assert_raises(ValueError, lrap_score,
|
|
[0, 1, 0], [0.25, 0.3, 0.2])
|
|
assert_raises(ValueError, lrap_score, [0, 1, 2],
|
|
[[0.25, 0.75, 0.0], [0.7, 0.3, 0.0], [0.8, 0.2, 0.0]])
|
|
assert_raises(ValueError, lrap_score, [(0), (1), (2)],
|
|
[[0.25, 0.75, 0.0], [0.7, 0.3, 0.0], [0.8, 0.2, 0.0]])
|
|
|
|
# Check that y_true.shape != y_score.shape raise the proper exception
|
|
assert_raises(ValueError, lrap_score, [[0, 1], [0, 1]], [0, 1])
|
|
assert_raises(ValueError, lrap_score, [[0, 1], [0, 1]], [[0, 1]])
|
|
assert_raises(ValueError, lrap_score, [[0, 1], [0, 1]], [[0], [1]])
|
|
assert_raises(ValueError, lrap_score, [[0, 1]], [[0, 1], [0, 1]])
|
|
assert_raises(ValueError, lrap_score, [[0], [1]], [[0, 1], [0, 1]])
|
|
assert_raises(ValueError, lrap_score, [[0, 1], [0, 1]], [[0], [1]])
|
|
|
|
|
|
def check_lrap_only_ties(lrap_score):
|
|
# Check tie handling in score
|
|
# Basic check with only ties and increasing label space
|
|
for n_labels in range(2, 10):
|
|
y_score = np.ones((1, n_labels))
|
|
|
|
# Check for growing number of consecutive relevant
|
|
for n_relevant in range(1, n_labels):
|
|
# Check for a bunch of positions
|
|
for pos in range(n_labels - n_relevant):
|
|
y_true = np.zeros((1, n_labels))
|
|
y_true[0, pos:pos + n_relevant] = 1
|
|
assert_almost_equal(lrap_score(y_true, y_score),
|
|
n_relevant / n_labels)
|
|
|
|
|
|
def check_lrap_without_tie_and_increasing_score(lrap_score):
|
|
# Check that Label ranking average precision works for various
|
|
# Basic check with increasing label space size and decreasing score
|
|
for n_labels in range(2, 10):
|
|
y_score = n_labels - (np.arange(n_labels).reshape((1, n_labels)) + 1)
|
|
|
|
# First and last
|
|
y_true = np.zeros((1, n_labels))
|
|
y_true[0, 0] = 1
|
|
y_true[0, -1] = 1
|
|
assert_almost_equal(lrap_score(y_true, y_score),
|
|
(2 / n_labels + 1) / 2)
|
|
|
|
# Check for growing number of consecutive relevant label
|
|
for n_relevant in range(1, n_labels):
|
|
# Check for a bunch of position
|
|
for pos in range(n_labels - n_relevant):
|
|
y_true = np.zeros((1, n_labels))
|
|
y_true[0, pos:pos + n_relevant] = 1
|
|
assert_almost_equal(lrap_score(y_true, y_score),
|
|
sum((r + 1) / ((pos + r + 1) * n_relevant)
|
|
for r in range(n_relevant)))
|
|
|
|
|
|
def _my_lrap(y_true, y_score):
|
|
"""Simple implementation of label ranking average precision"""
|
|
check_consistent_length(y_true, y_score)
|
|
y_true = check_array(y_true)
|
|
y_score = check_array(y_score)
|
|
n_samples, n_labels = y_true.shape
|
|
score = np.empty((n_samples, ))
|
|
for i in range(n_samples):
|
|
# The best rank correspond to 1. Rank higher than 1 are worse.
|
|
# The best inverse ranking correspond to n_labels.
|
|
unique_rank, inv_rank = np.unique(y_score[i], return_inverse=True)
|
|
n_ranks = unique_rank.size
|
|
rank = n_ranks - inv_rank
|
|
|
|
# Rank need to be corrected to take into account ties
|
|
# ex: rank 1 ex aequo means that both label are rank 2.
|
|
corr_rank = np.bincount(rank, minlength=n_ranks + 1).cumsum()
|
|
rank = corr_rank[rank]
|
|
|
|
relevant = y_true[i].nonzero()[0]
|
|
if relevant.size == 0 or relevant.size == n_labels:
|
|
score[i] = 1
|
|
continue
|
|
|
|
score[i] = 0.
|
|
for label in relevant:
|
|
# Let's count the number of relevant label with better rank
|
|
# (smaller rank).
|
|
n_ranked_above = sum(rank[r] <= rank[label] for r in relevant)
|
|
|
|
# Weight by the rank of the actual label
|
|
score[i] += n_ranked_above / rank[label]
|
|
|
|
score[i] /= relevant.size
|
|
|
|
return score.mean()
|
|
|
|
|
|
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
|
|
n_samples=20, random_state=0):
|
|
_, y_true = make_multilabel_classification(n_features=1,
|
|
allow_unlabeled=False,
|
|
random_state=random_state,
|
|
n_classes=n_classes,
|
|
n_samples=n_samples)
|
|
|
|
# Score with ties
|
|
y_score = sparse_random_matrix(n_components=y_true.shape[0],
|
|
n_features=y_true.shape[1],
|
|
random_state=random_state)
|
|
|
|
if hasattr(y_score, "toarray"):
|
|
y_score = y_score.toarray()
|
|
score_lrap = label_ranking_average_precision_score(y_true, y_score)
|
|
score_my_lrap = _my_lrap(y_true, y_score)
|
|
assert_almost_equal(score_lrap, score_my_lrap)
|
|
|
|
# Uniform score
|
|
random_state = check_random_state(random_state)
|
|
y_score = random_state.uniform(size=(n_samples, n_classes))
|
|
score_lrap = label_ranking_average_precision_score(y_true, y_score)
|
|
score_my_lrap = _my_lrap(y_true, y_score)
|
|
assert_almost_equal(score_lrap, score_my_lrap)
|
|
|
|
|
|
def test_label_ranking_avp():
|
|
for fn in [label_ranking_average_precision_score, _my_lrap]:
|
|
yield check_lrap_toy, fn
|
|
yield check_lrap_without_tie_and_increasing_score, fn
|
|
yield check_lrap_only_ties, fn
|
|
yield check_zero_or_all_relevant_labels, fn
|
|
yield check_lrap_error_raised, label_ranking_average_precision_score
|
|
|
|
for n_samples, n_classes, random_state in product((1, 2, 8, 20),
|
|
(2, 5, 10),
|
|
range(1)):
|
|
yield (check_alternative_lrap_implementation,
|
|
label_ranking_average_precision_score,
|
|
n_classes, n_samples, random_state)
|
|
|
|
|
|
def test_coverage_error():
|
|
# Toy case
|
|
assert_almost_equal(coverage_error([[0, 1]], [[0.25, 0.75]]), 1)
|
|
assert_almost_equal(coverage_error([[0, 1]], [[0.75, 0.25]]), 2)
|
|
assert_almost_equal(coverage_error([[1, 1]], [[0.75, 0.25]]), 2)
|
|
assert_almost_equal(coverage_error([[0, 0]], [[0.75, 0.25]]), 0)
|
|
|
|
assert_almost_equal(coverage_error([[0, 0, 0]], [[0.25, 0.5, 0.75]]), 0)
|
|
assert_almost_equal(coverage_error([[0, 0, 1]], [[0.25, 0.5, 0.75]]), 1)
|
|
assert_almost_equal(coverage_error([[0, 1, 0]], [[0.25, 0.5, 0.75]]), 2)
|
|
assert_almost_equal(coverage_error([[0, 1, 1]], [[0.25, 0.5, 0.75]]), 2)
|
|
assert_almost_equal(coverage_error([[1, 0, 0]], [[0.25, 0.5, 0.75]]), 3)
|
|
assert_almost_equal(coverage_error([[1, 0, 1]], [[0.25, 0.5, 0.75]]), 3)
|
|
assert_almost_equal(coverage_error([[1, 1, 0]], [[0.25, 0.5, 0.75]]), 3)
|
|
assert_almost_equal(coverage_error([[1, 1, 1]], [[0.25, 0.5, 0.75]]), 3)
|
|
|
|
assert_almost_equal(coverage_error([[0, 0, 0]], [[0.75, 0.5, 0.25]]), 0)
|
|
assert_almost_equal(coverage_error([[0, 0, 1]], [[0.75, 0.5, 0.25]]), 3)
|
|
assert_almost_equal(coverage_error([[0, 1, 0]], [[0.75, 0.5, 0.25]]), 2)
|
|
assert_almost_equal(coverage_error([[0, 1, 1]], [[0.75, 0.5, 0.25]]), 3)
|
|
assert_almost_equal(coverage_error([[1, 0, 0]], [[0.75, 0.5, 0.25]]), 1)
|
|
assert_almost_equal(coverage_error([[1, 0, 1]], [[0.75, 0.5, 0.25]]), 3)
|
|
assert_almost_equal(coverage_error([[1, 1, 0]], [[0.75, 0.5, 0.25]]), 2)
|
|
assert_almost_equal(coverage_error([[1, 1, 1]], [[0.75, 0.5, 0.25]]), 3)
|
|
|
|
assert_almost_equal(coverage_error([[0, 0, 0]], [[0.5, 0.75, 0.25]]), 0)
|
|
assert_almost_equal(coverage_error([[0, 0, 1]], [[0.5, 0.75, 0.25]]), 3)
|
|
assert_almost_equal(coverage_error([[0, 1, 0]], [[0.5, 0.75, 0.25]]), 1)
|
|
assert_almost_equal(coverage_error([[0, 1, 1]], [[0.5, 0.75, 0.25]]), 3)
|
|
assert_almost_equal(coverage_error([[1, 0, 0]], [[0.5, 0.75, 0.25]]), 2)
|
|
assert_almost_equal(coverage_error([[1, 0, 1]], [[0.5, 0.75, 0.25]]), 3)
|
|
assert_almost_equal(coverage_error([[1, 1, 0]], [[0.5, 0.75, 0.25]]), 2)
|
|
assert_almost_equal(coverage_error([[1, 1, 1]], [[0.5, 0.75, 0.25]]), 3)
|
|
|
|
# Non trival case
|
|
assert_almost_equal(coverage_error([[0, 1, 0], [1, 1, 0]],
|
|
[[0.1, 10., -3], [0, 1, 3]]),
|
|
(1 + 3) / 2.)
|
|
|
|
assert_almost_equal(coverage_error([[0, 1, 0], [1, 1, 0], [0, 1, 1]],
|
|
[[0.1, 10, -3], [0, 1, 3], [0, 2, 0]]),
|
|
(1 + 3 + 3) / 3.)
|
|
|
|
assert_almost_equal(coverage_error([[0, 1, 0], [1, 1, 0], [0, 1, 1]],
|
|
[[0.1, 10, -3], [3, 1, 3], [0, 2, 0]]),
|
|
(1 + 3 + 3) / 3.)
|
|
|
|
|
|
def test_coverage_tie_handling():
|
|
assert_almost_equal(coverage_error([[0, 0]], [[0.5, 0.5]]), 0)
|
|
assert_almost_equal(coverage_error([[1, 0]], [[0.5, 0.5]]), 2)
|
|
assert_almost_equal(coverage_error([[0, 1]], [[0.5, 0.5]]), 2)
|
|
assert_almost_equal(coverage_error([[1, 1]], [[0.5, 0.5]]), 2)
|
|
|
|
assert_almost_equal(coverage_error([[0, 0, 0]], [[0.25, 0.5, 0.5]]), 0)
|
|
assert_almost_equal(coverage_error([[0, 0, 1]], [[0.25, 0.5, 0.5]]), 2)
|
|
assert_almost_equal(coverage_error([[0, 1, 0]], [[0.25, 0.5, 0.5]]), 2)
|
|
assert_almost_equal(coverage_error([[0, 1, 1]], [[0.25, 0.5, 0.5]]), 2)
|
|
assert_almost_equal(coverage_error([[1, 0, 0]], [[0.25, 0.5, 0.5]]), 3)
|
|
assert_almost_equal(coverage_error([[1, 0, 1]], [[0.25, 0.5, 0.5]]), 3)
|
|
assert_almost_equal(coverage_error([[1, 1, 0]], [[0.25, 0.5, 0.5]]), 3)
|
|
assert_almost_equal(coverage_error([[1, 1, 1]], [[0.25, 0.5, 0.5]]), 3)
|
|
|
|
|
|
def test_label_ranking_loss():
|
|
assert_almost_equal(label_ranking_loss([[0, 1]], [[0.25, 0.75]]), 0)
|
|
assert_almost_equal(label_ranking_loss([[0, 1]], [[0.75, 0.25]]), 1)
|
|
|
|
assert_almost_equal(label_ranking_loss([[0, 0, 1]], [[0.25, 0.5, 0.75]]),
|
|
0)
|
|
assert_almost_equal(label_ranking_loss([[0, 1, 0]], [[0.25, 0.5, 0.75]]),
|
|
1 / 2)
|
|
assert_almost_equal(label_ranking_loss([[0, 1, 1]], [[0.25, 0.5, 0.75]]),
|
|
0)
|
|
assert_almost_equal(label_ranking_loss([[1, 0, 0]], [[0.25, 0.5, 0.75]]),
|
|
2 / 2)
|
|
assert_almost_equal(label_ranking_loss([[1, 0, 1]], [[0.25, 0.5, 0.75]]),
|
|
1 / 2)
|
|
assert_almost_equal(label_ranking_loss([[1, 1, 0]], [[0.25, 0.5, 0.75]]),
|
|
2 / 2)
|
|
|
|
# Undefined metrics - the ranking doesn't matter
|
|
assert_almost_equal(label_ranking_loss([[0, 0]], [[0.75, 0.25]]), 0)
|
|
assert_almost_equal(label_ranking_loss([[1, 1]], [[0.75, 0.25]]), 0)
|
|
assert_almost_equal(label_ranking_loss([[0, 0]], [[0.5, 0.5]]), 0)
|
|
assert_almost_equal(label_ranking_loss([[1, 1]], [[0.5, 0.5]]), 0)
|
|
|
|
assert_almost_equal(label_ranking_loss([[0, 0, 0]], [[0.5, 0.75, 0.25]]),
|
|
0)
|
|
assert_almost_equal(label_ranking_loss([[1, 1, 1]], [[0.5, 0.75, 0.25]]),
|
|
0)
|
|
assert_almost_equal(label_ranking_loss([[0, 0, 0]], [[0.25, 0.5, 0.5]]),
|
|
0)
|
|
assert_almost_equal(label_ranking_loss([[1, 1, 1]], [[0.25, 0.5, 0.5]]), 0)
|
|
|
|
# Non trival case
|
|
assert_almost_equal(label_ranking_loss([[0, 1, 0], [1, 1, 0]],
|
|
[[0.1, 10., -3], [0, 1, 3]]),
|
|
(0 + 2 / 2) / 2.)
|
|
|
|
assert_almost_equal(label_ranking_loss(
|
|
[[0, 1, 0], [1, 1, 0], [0, 1, 1]],
|
|
[[0.1, 10, -3], [0, 1, 3], [0, 2, 0]]),
|
|
(0 + 2 / 2 + 1 / 2) / 3.)
|
|
|
|
assert_almost_equal(label_ranking_loss(
|
|
[[0, 1, 0], [1, 1, 0], [0, 1, 1]],
|
|
[[0.1, 10, -3], [3, 1, 3], [0, 2, 0]]),
|
|
(0 + 2 / 2 + 1 / 2) / 3.)
|
|
|
|
# Sparse csr matrices
|
|
assert_almost_equal(label_ranking_loss(
|
|
csr_matrix(np.array([[0, 1, 0], [1, 1, 0]])),
|
|
[[0.1, 10, -3], [3, 1, 3]]),
|
|
(0 + 2 / 2) / 2.)
|
|
|
|
|
|
def test_ranking_appropriate_input_shape():
|
|
# Check that y_true.shape != y_score.shape raise the proper exception
|
|
assert_raises(ValueError, label_ranking_loss, [[0, 1], [0, 1]], [0, 1])
|
|
assert_raises(ValueError, label_ranking_loss, [[0, 1], [0, 1]], [[0, 1]])
|
|
assert_raises(ValueError, label_ranking_loss,
|
|
[[0, 1], [0, 1]], [[0], [1]])
|
|
|
|
assert_raises(ValueError, label_ranking_loss, [[0, 1]], [[0, 1], [0, 1]])
|
|
assert_raises(ValueError, label_ranking_loss,
|
|
[[0], [1]], [[0, 1], [0, 1]])
|
|
assert_raises(ValueError, label_ranking_loss, [[0, 1], [0, 1]], [[0], [1]])
|
|
|
|
|
|
def test_ranking_loss_ties_handling():
|
|
# Tie handling
|
|
assert_almost_equal(label_ranking_loss([[1, 0]], [[0.5, 0.5]]), 1)
|
|
assert_almost_equal(label_ranking_loss([[0, 1]], [[0.5, 0.5]]), 1)
|
|
assert_almost_equal(label_ranking_loss([[0, 0, 1]], [[0.25, 0.5, 0.5]]),
|
|
1 / 2)
|
|
assert_almost_equal(label_ranking_loss([[0, 1, 0]], [[0.25, 0.5, 0.5]]),
|
|
1 / 2)
|
|
assert_almost_equal(label_ranking_loss([[0, 1, 1]], [[0.25, 0.5, 0.5]]), 0)
|
|
assert_almost_equal(label_ranking_loss([[1, 0, 0]], [[0.25, 0.5, 0.5]]), 1)
|
|
assert_almost_equal(label_ranking_loss([[1, 0, 1]], [[0.25, 0.5, 0.5]]), 1)
|
|
assert_almost_equal(label_ranking_loss([[1, 1, 0]], [[0.25, 0.5, 0.5]]), 1)
|