"""Metrics to assess performance on classification task given scores
|
||
|
|
||
|
Functions named as ``*_score`` return a scalar value to maximize: the higher
|
||
|
the better
|
||
|
|
||
|
Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize:
|
||
|
the lower the better
|
||
|
"""
|
||
|
|
||
|
# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#          Mathieu Blondel <mathieu@mblondel.org>
#          Olivier Grisel <olivier.grisel@ensta.org>
#          Arnaud Joly <a.joly@ulg.ac.be>
#          Jochen Wersdorfer <jochen@wersdoerfer.de>
#          Lars Buitinck
#          Joel Nothman <joel.nothman@gmail.com>
#          Noel Dawe <noel@dawe.me>
# License: BSD 3 clause

from __future__ import division

import warnings
import numpy as np
from scipy.sparse import csr_matrix
from scipy.stats import rankdata

from ..utils import assert_all_finite
from ..utils import check_consistent_length
from ..utils import column_or_1d, check_array
from ..utils.multiclass import type_of_target
from ..utils.extmath import stable_cumsum
from ..utils.sparsefuncs import count_nonzero
from ..exceptions import UndefinedMetricWarning
from ..preprocessing import LabelBinarizer

from .base import _average_binary_score


def auc(x, y, reorder=False):
    """Compute Area Under the Curve (AUC) using the trapezoidal rule

    This is a general function, given points on a curve. For computing the
    area under the ROC-curve, see :func:`roc_auc_score`. For an alternative
    way to summarize a precision-recall curve, see
    :func:`average_precision_score`.

    Parameters
    ----------
    x : array, shape = [n]
        x coordinates.
    y : array, shape = [n]
        y coordinates.
    reorder : boolean, optional (default=False)
        If True, assume that the curve is ascending in the case of ties, as for
        an ROC curve. If the curve is non-ascending, the result will be wrong.

    Returns
    -------
    auc : float

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import metrics
    >>> y = np.array([1, 1, 2, 2])
    >>> pred = np.array([0.1, 0.4, 0.35, 0.8])
    >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)
    >>> metrics.auc(fpr, tpr)
    0.75

    See also
    --------
    roc_auc_score : Compute the area under the ROC curve
    average_precision_score : Compute average precision from prediction scores
    precision_recall_curve :
        Compute precision-recall pairs for different probability thresholds
    """
    check_consistent_length(x, y)
    x = column_or_1d(x)
    y = column_or_1d(y)

    if x.shape[0] < 2:
        raise ValueError('At least 2 points are needed to compute'
                         ' area under curve, but x.shape = %s' % x.shape)

    direction = 1
    if reorder:
        # reorder the data points according to the x axis and using y to
        # break ties
        order = np.lexsort((y, x))
        x, y = x[order], y[order]
    else:
        dx = np.diff(x)
        if np.any(dx < 0):
            if np.all(dx <= 0):
                direction = -1
            else:
                raise ValueError("Reordering is not turned on, and "
                                 "the x array is not increasing: %s" % x)

    area = direction * np.trapz(y, x)
    if isinstance(area, np.memmap):
        # Reductions such as .sum used internally in np.trapz do not return a
        # scalar by default for numpy.memmap instances contrary to
        # regular numpy.ndarray instances.
        area = area.dtype.type(area)
    return area


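# Illustrative walk-through of the `auc` doctest above (an editorial sketch,
# not part of the upstream code): roc_curve on y = [1, 1, 2, 2],
# pred = [0.1, 0.4, 0.35, 0.8] with pos_label=2 yields fpr = [0, 0.5, 0.5, 1]
# and tpr = [0.5, 0.5, 1, 1]; the trapezoidal rule then sums
# 0.5 * (0.5 + 0.5) / 2 + 0 * (0.5 + 1) / 2 + 0.5 * (1 + 1) / 2 = 0.75.
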
def average_precision_score(y_true, y_score, average="macro",
|
||
|
sample_weight=None):
|
||
|
"""Compute average precision (AP) from prediction scores
|
||
|
|
||
|
AP summarizes a precision-recall curve as the weighted mean of precisions
|
||
|
achieved at each threshold, with the increase in recall from the previous
|
||
|
threshold used as the weight:
|
||
|
|
||
|
.. math::
|
||
|
\\text{AP} = \\sum_n (R_n - R_{n-1}) P_n
|
||
|
|
||
|
where :math:`P_n` and :math:`R_n` are the precision and recall at the nth
|
||
|
threshold [1]_. This implementation is not interpolated and is different
|
||
|
from computing the area under the precision-recall curve with the
|
||
|
trapezoidal rule, which uses linear interpolation and can be too
|
||
|
optimistic.
|
||
|
|
||
|
Note: this implementation is restricted to the binary classification task
|
||
|
or multilabel classification task.
|
||
|
|
||
|
Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
y_true : array, shape = [n_samples] or [n_samples, n_classes]
|
||
|
True binary labels in binary label indicators.
|
||
|
|
||
|
y_score : array, shape = [n_samples] or [n_samples, n_classes]
|
||
|
Target scores, can either be probability estimates of the positive
|
||
|
class, confidence values, or non-thresholded measure of decisions
|
||
|
(as returned by "decision_function" on some classifiers).
|
||
|
|
||
|
average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted']
|
||
|
If ``None``, the scores for each class are returned. Otherwise,
|
||
|
this determines the type of averaging performed on the data:
|
||
|
|
||
|
``'micro'``:
|
||
|
Calculate metrics globally by considering each element of the label
|
||
|
indicator matrix as a label.
|
||
|
``'macro'``:
|
||
|
Calculate metrics for each label, and find their unweighted
|
||
|
mean. This does not take label imbalance into account.
|
||
|
``'weighted'``:
|
||
|
Calculate metrics for each label, and find their average, weighted
|
||
|
by support (the number of true instances for each label).
|
||
|
``'samples'``:
|
||
|
Calculate metrics for each instance, and find their average.
|
||
|
|
||
|
sample_weight : array-like of shape = [n_samples], optional
|
||
|
Sample weights.
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
average_precision : float
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
.. [1] `Wikipedia entry for the Average precision
|
||
|
<http://en.wikipedia.org/w/index.php?title=Information_retrieval&
|
||
|
oldid=793358396#Average_precision>`_
|
||
|
|
||
|
See also
|
||
|
--------
|
||
|
roc_auc_score : Compute the area under the ROC curve
|
||
|
|
||
|
precision_recall_curve :
|
||
|
Compute precision-recall pairs for different probability thresholds
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> import numpy as np
|
||
|
>>> from sklearn.metrics import average_precision_score
|
||
|
>>> y_true = np.array([0, 0, 1, 1])
|
||
|
>>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
|
||
|
>>> average_precision_score(y_true, y_scores) # doctest: +ELLIPSIS
|
||
|
0.83...
|
||
|
|
||
|
"""
|
||
|
def _binary_uninterpolated_average_precision(
|
||
|
y_true, y_score, sample_weight=None):
|
||
|
precision, recall, thresholds = precision_recall_curve(
|
||
|
y_true, y_score, sample_weight=sample_weight)
|
||
|
# Return the step function integral
|
||
|
# The following works because the last entry of precision is
|
||
|
# guaranteed to be 1, as returned by precision_recall_curve
|
||
|
return -np.sum(np.diff(recall) * np.array(precision)[:-1])
|
||
|
|
||
|
return _average_binary_score(_binary_uninterpolated_average_precision,
|
||
|
y_true, y_score, average,
|
||
|
sample_weight=sample_weight)
|
||
|
|
||
|
|
||
|
|
||
|
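# Illustrative walk-through of the `average_precision_score` doctest above
# (an editorial sketch, not part of the upstream code): for
# y_true = [0, 0, 1, 1] and y_scores = [0.1, 0.4, 0.35, 0.8],
# precision_recall_curve returns precision = [2/3, 0.5, 1, 1] and
# recall = [1, 0.5, 0.5, 0], so the step-function integral
# -sum(diff(recall) * precision[:-1]) equals
# 0.5 * 2/3 + 0 * 0.5 + 0.5 * 1 = 0.8333..., matching the 0.83... output.
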
def roc_auc_score(y_true, y_score, average="macro", sample_weight=None):
    """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
    from prediction scores.

    Note: this implementation is restricted to the binary classification task
    or multilabel classification task in label indicator format.

    Read more in the :ref:`User Guide <roc_metrics>`.

    Parameters
    ----------
    y_true : array, shape = [n_samples] or [n_samples, n_classes]
        True binary labels or binary label indicators.

    y_score : array, shape = [n_samples] or [n_samples, n_classes]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted']
        If ``None``, the scores for each class are returned. Otherwise,
        this determines the type of averaging performed on the data:

        ``'micro'``:
            Calculate metrics globally by considering each element of the label
            indicator matrix as a label.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean. This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label).
        ``'samples'``:
            Calculate metrics for each instance, and find their average.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    auc : float

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    See also
    --------
    average_precision_score : Area under the precision-recall curve

    roc_curve : Compute Receiver operating characteristic (ROC) curve

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import roc_auc_score
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> roc_auc_score(y_true, y_scores)
    0.75

    """
    def _binary_roc_auc_score(y_true, y_score, sample_weight=None):
        if len(np.unique(y_true)) != 2:
            raise ValueError("Only one class present in y_true. ROC AUC score "
                             "is not defined in that case.")

        fpr, tpr, thresholds = roc_curve(y_true, y_score,
                                         sample_weight=sample_weight)
        return auc(fpr, tpr, reorder=True)

    return _average_binary_score(
        _binary_roc_auc_score, y_true, y_score, average,
        sample_weight=sample_weight)


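# Illustrative note (an editorial sketch, not part of the upstream code): for
# the doctest above, roc_curve(y_true, y_scores) gives fpr = [0, 0.5, 0.5, 1]
# and tpr = [0.5, 0.5, 1, 1], and auc(fpr, tpr) integrates those points to
# 0.75, the value returned by roc_auc_score.
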
def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None):
    """Calculate true and false positives per binary classification threshold.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True targets of binary classification

    y_score : array, shape = [n_samples]
        Estimated probabilities or decision function

    pos_label : int or str, default=None
        The label of the positive class

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    fps : array, shape = [n_thresholds]
        A count of false positives, at index i being the number of negative
        samples assigned a score >= thresholds[i]. The total number of
        negative samples is equal to fps[-1] (thus true negatives are given by
        fps[-1] - fps).

    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
        An increasing count of true positives, at index i being the number
        of positive samples assigned a score >= thresholds[i]. The total
        number of positive samples is equal to tps[-1] (thus false negatives
        are given by tps[-1] - tps).

    thresholds : array, shape = [n_thresholds]
        Decreasing score values.
    """
    # Check to make sure y_true is valid
    y_type = type_of_target(y_true)
    if not (y_type == "binary" or
            (y_type == "multiclass" and pos_label is not None)):
        raise ValueError("{0} format is not supported".format(y_type))

    check_consistent_length(y_true, y_score, sample_weight)
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    assert_all_finite(y_true)
    assert_all_finite(y_score)

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)

    # ensure binary classification if pos_label is not specified
    classes = np.unique(y_true)
    if (pos_label is None and
        not (np.array_equal(classes, [0, 1]) or
             np.array_equal(classes, [-1, 1]) or
             np.array_equal(classes, [0]) or
             np.array_equal(classes, [-1]) or
             np.array_equal(classes, [1]))):
        raise ValueError("Data is not binary and pos_label is not specified")
    elif pos_label is None:
        pos_label = 1.

    # make y_true a boolean vector
    y_true = (y_true == pos_label)

    # sort scores and corresponding truth values
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]
    if sample_weight is not None:
        weight = sample_weight[desc_score_indices]
    else:
        weight = 1.

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    distinct_value_indices = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = stable_cumsum(y_true * weight)[threshold_idxs]
    if sample_weight is not None:
        fps = stable_cumsum(weight)[threshold_idxs] - tps
    else:
        fps = 1 + threshold_idxs - tps
    return fps, tps, y_score[threshold_idxs]


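# Worked trace of _binary_clf_curve (an editorial sketch on hypothetical
# inputs, not part of the upstream code): for y_true = [0, 0, 1, 1] and
# y_score = [0.1, 0.4, 0.35, 0.8] with the implicit pos_label of 1, sorting by
# decreasing score gives scores [0.8, 0.4, 0.35, 0.1] and labels [1, 0, 1, 0].
# All scores are distinct, so every index is a threshold index and
#     tps = cumsum([1, 0, 1, 0]) = [1, 1, 2, 2]
#     fps = 1 + [0, 1, 2, 3] - tps = [0, 1, 1, 2]
#     thresholds = [0.8, 0.4, 0.35, 0.1]
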
def precision_recall_curve(y_true, probas_pred, pos_label=None,
                           sample_weight=None):
    """Compute precision-recall pairs for different probability thresholds

    Note: this implementation is restricted to the binary classification task.

    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The precision is
    intuitively the ability of the classifier not to label as positive a sample
    that is negative.

    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
    true positives and ``fn`` the number of false negatives. The recall is
    intuitively the ability of the classifier to find all the positive samples.

    The last precision and recall values are 1. and 0. respectively and do not
    have a corresponding threshold. This ensures that the graph starts on the
    x axis.

    Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True targets of binary classification in range {-1, 1} or {0, 1}.

    probas_pred : array, shape = [n_samples]
        Estimated probabilities or decision function.

    pos_label : int or str, default=None
        The label of the positive class

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    precision : array, shape = [n_thresholds + 1]
        Precision values such that element i is the precision of
        predictions with score >= thresholds[i] and the last element is 1.

    recall : array, shape = [n_thresholds + 1]
        Decreasing recall values such that element i is the recall of
        predictions with score >= thresholds[i] and the last element is 0.

    thresholds : array, shape = [n_thresholds <= len(np.unique(probas_pred))]
        Increasing thresholds on the decision function used to compute
        precision and recall.

    See also
    --------
    average_precision_score : Compute average precision from prediction scores

    roc_curve : Compute Receiver operating characteristic (ROC) curve

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import precision_recall_curve
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> precision, recall, thresholds = precision_recall_curve(
    ...     y_true, y_scores)
    >>> precision  # doctest: +ELLIPSIS
    array([ 0.66..., 0.5 , 1. , 1. ])
    >>> recall
    array([ 1. , 0.5, 0.5, 0. ])
    >>> thresholds
    array([ 0.35, 0.4 , 0.8 ])

    """
    fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred,
                                             pos_label=pos_label,
                                             sample_weight=sample_weight)

    precision = tps / (tps + fps)
    recall = tps / tps[-1]

    # stop when full recall attained
    # and reverse the outputs so recall is decreasing
    last_ind = tps.searchsorted(tps[-1])
    sl = slice(last_ind, None, -1)
    return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl]


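# Illustrative walk-through of the `precision_recall_curve` doctest above (an
# editorial sketch, not part of the upstream code): _binary_clf_curve returns
# tps = [1, 1, 2, 2], fps = [0, 1, 1, 2], thresholds = [0.8, 0.4, 0.35, 0.1],
# so precision = tps / (tps + fps) = [1, 0.5, 2/3, 0.5] and
# recall = tps / 2 = [0.5, 0.5, 1, 1]. Full recall is first reached at
# index 2, so the arrays are reversed from that index and the final (1, 0)
# point is appended, giving precision = [0.66..., 0.5, 1, 1],
# recall = [1, 0.5, 0.5, 0] and thresholds = [0.35, 0.4, 0.8].
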
def roc_curve(y_true, y_score, pos_label=None, sample_weight=None,
              drop_intermediate=True):
    """Compute Receiver operating characteristic (ROC)

    Note: this implementation is restricted to the binary classification task.

    Read more in the :ref:`User Guide <roc_metrics>`.

    Parameters
    ----------

    y_true : array, shape = [n_samples]
        True binary labels in range {0, 1} or {-1, 1}. If labels are not
        binary, pos_label should be explicitly given.

    y_score : array, shape = [n_samples]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    pos_label : int or str, default=None
        Label considered as positive and others are considered negative.

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    drop_intermediate : boolean, optional (default=True)
        Whether to drop some suboptimal thresholds which would not appear
        on a plotted ROC curve. This is useful in order to create lighter
        ROC curves.

        .. versionadded:: 0.17
           parameter *drop_intermediate*.

    Returns
    -------
    fpr : array, shape = [>2]
        Increasing false positive rates such that element i is the false
        positive rate of predictions with score >= thresholds[i].

    tpr : array, shape = [>2]
        Increasing true positive rates such that element i is the true
        positive rate of predictions with score >= thresholds[i].

    thresholds : array, shape = [n_thresholds]
        Decreasing thresholds on the decision function used to compute
        fpr and tpr. `thresholds[0]` represents no instances being predicted
        and is arbitrarily set to `max(y_score) + 1`.

    See also
    --------
    roc_auc_score : Compute the area under the ROC curve

    Notes
    -----
    Since the thresholds are sorted from low to high values, they
    are reversed upon returning them to ensure they correspond to both ``fpr``
    and ``tpr``, which are sorted in reversed order during their calculation.

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import metrics
    >>> y = np.array([1, 1, 2, 2])
    >>> scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)
    >>> fpr
    array([ 0. , 0.5, 0.5, 1. ])
    >>> tpr
    array([ 0.5, 0.5, 1. , 1. ])
    >>> thresholds
    array([ 0.8 , 0.4 , 0.35, 0.1 ])

    """
    fps, tps, thresholds = _binary_clf_curve(
        y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)

    # Attempt to drop thresholds corresponding to points in between and
    # collinear with other points. These are always suboptimal and do not
    # appear on a plotted ROC curve (and thus do not affect the AUC).
    # Here np.diff(_, 2) is used as a "second derivative" to tell if there
    # is a corner at the point. Both fps and tps must be tested to handle
    # thresholds with multiple data points (which are combined in
    # _binary_clf_curve). This keeps all cases where the point should be kept,
    # but does not drop more complicated cases like fps = [1, 3, 7],
    # tps = [1, 2, 4]; there is no harm in keeping too many thresholds.
    if drop_intermediate and len(fps) > 2:
        optimal_idxs = np.where(np.r_[True,
                                      np.logical_or(np.diff(fps, 2),
                                                    np.diff(tps, 2)),
                                      True])[0]
        fps = fps[optimal_idxs]
        tps = tps[optimal_idxs]
        thresholds = thresholds[optimal_idxs]

    if tps.size == 0 or fps[0] != 0:
        # Add an extra threshold position if necessary
        tps = np.r_[0, tps]
        fps = np.r_[0, fps]
        thresholds = np.r_[thresholds[0] + 1, thresholds]

    if fps[-1] <= 0:
        warnings.warn("No negative samples in y_true, "
                      "false positive value should be meaningless",
                      UndefinedMetricWarning)
        fpr = np.repeat(np.nan, fps.shape)
    else:
        fpr = fps / fps[-1]

    if tps[-1] <= 0:
        warnings.warn("No positive samples in y_true, "
                      "true positive value should be meaningless",
                      UndefinedMetricWarning)
        tpr = np.repeat(np.nan, tps.shape)
    else:
        tpr = tps / tps[-1]

    return fpr, tpr, thresholds


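# Illustrative walk-through of the `roc_curve` doctest above (an editorial
# sketch, not part of the upstream code): _binary_clf_curve returns
# fps = [0, 1, 1, 2], tps = [1, 1, 2, 2], thresholds = [0.8, 0.4, 0.35, 0.1].
# Every interior point is a corner, so drop_intermediate keeps all of them,
# and since fps[0] == 0 no extra threshold is prepended. Dividing by the
# totals fps[-1] == tps[-1] == 2 gives fpr = [0, 0.5, 0.5, 1] and
# tpr = [0.5, 0.5, 1, 1].
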
def label_ranking_average_precision_score(y_true, y_score):
    """Compute ranking-based average precision

    Label ranking average precision (LRAP) is the average over each ground
    truth label assigned to each sample, of the ratio of true vs. total
    labels with lower score.

    This metric is used in multilabel ranking problems, where the goal
    is to give a better rank to the labels associated with each sample.

    The obtained score is always strictly greater than 0 and
    the best value is 1.

    Read more in the :ref:`User Guide <label_ranking_average_precision>`.

    Parameters
    ----------
    y_true : array or sparse matrix, shape = [n_samples, n_labels]
        True binary labels in binary indicator format.

    y_score : array, shape = [n_samples, n_labels]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    Returns
    -------
    score : float

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import label_ranking_average_precision_score
    >>> y_true = np.array([[1, 0, 0], [0, 0, 1]])
    >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])
    >>> label_ranking_average_precision_score(y_true, y_score) \
        # doctest: +ELLIPSIS
    0.416...

    """
    check_consistent_length(y_true, y_score)
    y_true = check_array(y_true, ensure_2d=False)
    y_score = check_array(y_score, ensure_2d=False)

    if y_true.shape != y_score.shape:
        raise ValueError("y_true and y_score have different shape")

    # Handle badly formatted array and the degenerate case with one label
    y_type = type_of_target(y_true)
    if (y_type != "multilabel-indicator" and
            not (y_type == "binary" and y_true.ndim == 2)):
        raise ValueError("{0} format is not supported".format(y_type))

    y_true = csr_matrix(y_true)
    y_score = -y_score

    n_samples, n_labels = y_true.shape

    out = 0.
    for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])):
        relevant = y_true.indices[start:stop]

        if (relevant.size == 0 or relevant.size == n_labels):
            # If all labels are relevant or irrelevant, the score is 1;
            # the label ranking has no meaning.
            out += 1.
            continue

        scores_i = y_score[i]
        rank = rankdata(scores_i, 'max')[relevant]
        L = rankdata(scores_i[relevant], 'max')
        out += (L / rank).mean()

    return out / n_samples


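# Illustrative walk-through of the LRAP doctest above (an editorial sketch,
# not part of the upstream code): in the first sample the only true label
# (score 0.75) is ranked 2nd of the 3 scores, contributing 1/2; in the second
# sample the only true label (score 0.1) is ranked 3rd, contributing 1/3.
# The average over the two samples is (1/2 + 1/3) / 2 = 0.416...
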
def coverage_error(y_true, y_score, sample_weight=None):
    """Coverage error measure

    Compute how far we need to go through the ranked scores to cover all
    true labels. The best value is equal to the average number
    of labels in ``y_true`` per sample.

    Ties in ``y_score`` are broken by giving the maximal rank that would have
    been assigned to all tied values.

    Note: Our implementation's score is 1 greater than the one given in
    Tsoumakas et al., 2010. This extends it to handle the degenerate case
    in which an instance has 0 true labels.

    Read more in the :ref:`User Guide <coverage_error>`.

    Parameters
    ----------
    y_true : array, shape = [n_samples, n_labels]
        True binary labels in binary indicator format.

    y_score : array, shape = [n_samples, n_labels]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    coverage_error : float

    References
    ----------
    .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).
           Mining multi-label data. In Data mining and knowledge discovery
           handbook (pp. 667-685). Springer US.

    """
    y_true = check_array(y_true, ensure_2d=False)
    y_score = check_array(y_score, ensure_2d=False)
    check_consistent_length(y_true, y_score, sample_weight)

    y_type = type_of_target(y_true)
    if y_type != "multilabel-indicator":
        raise ValueError("{0} format is not supported".format(y_type))

    if y_true.shape != y_score.shape:
        raise ValueError("y_true and y_score have different shape")

    y_score_mask = np.ma.masked_array(y_score, mask=np.logical_not(y_true))
    y_min_relevant = y_score_mask.min(axis=1).reshape((-1, 1))
    coverage = (y_score >= y_min_relevant).sum(axis=1)
    coverage = coverage.filled(0)

    return np.average(coverage, weights=sample_weight)


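# Illustrative example for coverage_error (an editorial sketch on hypothetical
# inputs reused from the label ranking doctest above, not part of the upstream
# code): for y_true = [[1, 0, 0], [0, 0, 1]] and
# y_score = [[0.75, 0.5, 1], [1, 0.2, 0.1]], the lowest-scored true label of
# the first sample (0.75) is covered after 2 labels and that of the second
# sample (0.1) only after all 3, so the returned value is (2 + 3) / 2 = 2.5.
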
def label_ranking_loss(y_true, y_score, sample_weight=None):
    """Compute Ranking loss measure

    Compute the average number of label pairs that are incorrectly ordered
    given ``y_score``, weighted by the size of the label set and the number of
    labels not in the label set.

    This is similar to the error set size, but weighted by the number of
    relevant and irrelevant labels. The best performance is achieved with
    a ranking loss of zero.

    Read more in the :ref:`User Guide <label_ranking_loss>`.

    .. versionadded:: 0.17
       A function *label_ranking_loss*

    Parameters
    ----------
    y_true : array or sparse matrix, shape = [n_samples, n_labels]
        True binary labels in binary indicator format.

    y_score : array, shape = [n_samples, n_labels]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    sample_weight : array-like of shape = [n_samples], optional
        Sample weights.

    Returns
    -------
    loss : float

    References
    ----------
    .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).
           Mining multi-label data. In Data mining and knowledge discovery
           handbook (pp. 667-685). Springer US.

    """
    y_true = check_array(y_true, ensure_2d=False, accept_sparse='csr')
    y_score = check_array(y_score, ensure_2d=False)
    check_consistent_length(y_true, y_score, sample_weight)

    y_type = type_of_target(y_true)
    if y_type not in ("multilabel-indicator",):
        raise ValueError("{0} format is not supported".format(y_type))

    if y_true.shape != y_score.shape:
        raise ValueError("y_true and y_score have different shape")

    n_samples, n_labels = y_true.shape

    y_true = csr_matrix(y_true)

    loss = np.zeros(n_samples)
    for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])):
        # Sort and bin the label scores
        unique_scores, unique_inverse = np.unique(y_score[i],
                                                  return_inverse=True)
        true_at_reversed_rank = np.bincount(
            unique_inverse[y_true.indices[start:stop]],
            minlength=len(unique_scores))
        all_at_reversed_rank = np.bincount(unique_inverse,
                                           minlength=len(unique_scores))
        false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank

        # if the scores are sorted, it is possible to count the number of
        # incorrectly ordered pairs in linear time by cumulatively counting
        # how many false labels of a given score have a score higher than the
        # accumulated true labels with lower score.
        loss[i] = np.dot(true_at_reversed_rank.cumsum(),
                         false_at_reversed_rank)

    n_positives = count_nonzero(y_true, axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        loss /= ((n_labels - n_positives) * n_positives)

    # When there are no positive or no negative labels, those values should
    # be considered correct, i.e. the ranking does not matter.
    loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0.

    return np.average(loss, weights=sample_weight)
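
# Illustrative example for label_ranking_loss (an editorial sketch on
# hypothetical inputs reused from the label ranking doctest above, not part of
# the upstream code): for y_true = [[1, 0, 0], [0, 0, 1]] and
# y_score = [[0.75, 0.5, 1], [1, 0.2, 0.1]], the first sample has 1 wrongly
# ordered (true, false) pair out of 1 * 2 possible pairs (loss 0.5) and the
# second has 2 out of 2 (loss 1.0), so the returned value is
# (0.5 + 1.0) / 2 = 0.75.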