# Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi # # License: BSD 3 clause """ Multi-class / multi-label utility function ========================================== """ from __future__ import division from itertools import chain from scipy.sparse import issparse from scipy.sparse.base import spmatrix from scipy.sparse import dok_matrix from scipy.sparse import lil_matrix import numpy as np from ..externals.six import string_types from ..utils.fixes import _Sequence as Sequence from .validation import check_array def _unique_multiclass(y): if hasattr(y, '__array__'): return np.unique(np.asarray(y)) else: return set(y) def _unique_indicator(y): return np.arange(check_array(y, ['csr', 'csc', 'coo']).shape[1]) _FN_UNIQUE_LABELS = { 'binary': _unique_multiclass, 'multiclass': _unique_multiclass, 'multilabel-indicator': _unique_indicator, } def unique_labels(*ys): """Extract an ordered array of unique labels We don't allow: - mix of multilabel and multiclass (single label) targets - mix of label indicator matrix and anything else, because there are no explicit labels) - mix of label indicator matrices of different sizes - mix of string and integer labels At the moment, we also don't allow "multiclass-multioutput" input type. Parameters ---------- *ys : array-likes, Returns ------- out : numpy array of shape [n_unique_labels] An ordered array of unique labels. Examples -------- >>> from sklearn.utils.multiclass import unique_labels >>> unique_labels([3, 5, 5, 5, 7, 7]) array([3, 5, 7]) >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4]) array([1, 2, 3, 4]) >>> unique_labels([1, 2, 10], [5, 11]) array([ 1, 2, 5, 10, 11]) """ if not ys: raise ValueError('No argument has been passed.') # Check that we don't mix label format ys_types = set(type_of_target(x) for x in ys) if ys_types == set(["binary", "multiclass"]): ys_types = set(["multiclass"]) if len(ys_types) > 1: raise ValueError("Mix type of y not allowed, got types %s" % ys_types) label_type = ys_types.pop() # Check consistency for the indicator format if (label_type == "multilabel-indicator" and len(set(check_array(y, ['csr', 'csc', 'coo']).shape[1] for y in ys)) > 1): raise ValueError("Multi-label binary indicator input with " "different numbers of labels") # Get the unique set of labels _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None) if not _unique_labels: raise ValueError("Unknown label type: %s" % repr(ys)) ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys)) # Check that we don't mix string type with number type if (len(set(isinstance(label, string_types) for label in ys_labels)) > 1): raise ValueError("Mix of label input types (string and number)") return np.array(sorted(ys_labels)) def _is_integral_float(y): return y.dtype.kind == 'f' and np.all(y.astype(int) == y) def is_multilabel(y): """ Check if ``y`` is in a multilabel format. Parameters ---------- y : numpy array of shape [n_samples] Target values. Returns ------- out : bool, Return ``True``, if ``y`` is in a multilabel format, else ```False``. Examples -------- >>> import numpy as np >>> from sklearn.utils.multiclass import is_multilabel >>> is_multilabel([0, 1, 0, 1]) False >>> is_multilabel([[1], [0, 2], []]) False >>> is_multilabel(np.array([[1, 0], [0, 0]])) True >>> is_multilabel(np.array([[1], [0], [0]])) False >>> is_multilabel(np.array([[1, 0, 0]])) True """ if hasattr(y, '__array__'): y = np.asarray(y) if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1): return False if issparse(y): if isinstance(y, (dok_matrix, lil_matrix)): y = y.tocsr() return (len(y.data) == 0 or np.unique(y.data).size == 1 and (y.dtype.kind in 'biu' or # bool, int, uint _is_integral_float(np.unique(y.data)))) else: labels = np.unique(y) return len(labels) < 3 and (y.dtype.kind in 'biu' or # bool, int, uint _is_integral_float(labels)) def check_classification_targets(y): """Ensure that target y is of a non-regression type. Only the following target types (as defined in type_of_target) are allowed: 'binary', 'multiclass', 'multiclass-multioutput', 'multilabel-indicator', 'multilabel-sequences' Parameters ---------- y : array-like """ y_type = type_of_target(y) if y_type not in ['binary', 'multiclass', 'multiclass-multioutput', 'multilabel-indicator', 'multilabel-sequences']: raise ValueError("Unknown label type: %r" % y_type) def type_of_target(y): """Determine the type of data indicated by the target. Note that this type is the most specific type that can be inferred. For example: * ``binary`` is more specific but compatible with ``multiclass``. * ``multiclass`` of integers is more specific but compatible with ``continuous``. * ``multilabel-indicator`` is more specific but compatible with ``multiclass-multioutput``. Parameters ---------- y : array-like Returns ------- target_type : string One of: * 'continuous': `y` is an array-like of floats that are not all integers, and is 1d or a column vector. * 'continuous-multioutput': `y` is a 2d array of floats that are not all integers, and both dimensions are of size > 1. * 'binary': `y` contains <= 2 discrete values and is 1d or a column vector. * 'multiclass': `y` contains more than two discrete values, is not a sequence of sequences, and is 1d or a column vector. * 'multiclass-multioutput': `y` is a 2d array that contains more than two discrete values, is not a sequence of sequences, and both dimensions are of size > 1. * 'multilabel-indicator': `y` is a label indicator matrix, an array of two dimensions with at least two columns, and at most 2 unique values. * 'unknown': `y` is array-like but none of the above, such as a 3d array, sequence of sequences, or an array of non-sequence objects. Examples -------- >>> import numpy as np >>> type_of_target([0.1, 0.6]) 'continuous' >>> type_of_target([1, -1, -1, 1]) 'binary' >>> type_of_target(['a', 'b', 'a']) 'binary' >>> type_of_target([1.0, 2.0]) 'binary' >>> type_of_target([1, 0, 2]) 'multiclass' >>> type_of_target([1.0, 0.0, 3.0]) 'multiclass' >>> type_of_target(['a', 'b', 'c']) 'multiclass' >>> type_of_target(np.array([[1, 2], [3, 1]])) 'multiclass-multioutput' >>> type_of_target([[1, 2]]) 'multiclass-multioutput' >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]])) 'continuous-multioutput' >>> type_of_target(np.array([[0, 1], [1, 1]])) 'multilabel-indicator' """ valid = ((isinstance(y, (Sequence, spmatrix)) or hasattr(y, '__array__')) and not isinstance(y, string_types)) if not valid: raise ValueError('Expected array-like (array or non-string sequence), ' 'got %r' % y) sparseseries = (y.__class__.__name__ == 'SparseSeries') if sparseseries: raise ValueError("y cannot be class 'SparseSeries'.") if is_multilabel(y): return 'multilabel-indicator' try: y = np.asarray(y) except ValueError: # Known to fail in numpy 1.3 for array of arrays return 'unknown' # The old sequence of sequences format try: if (not hasattr(y[0], '__array__') and isinstance(y[0], Sequence) and not isinstance(y[0], string_types)): raise ValueError('You appear to be using a legacy multi-label data' ' representation. Sequence of sequences are no' ' longer supported; use a binary array or sparse' ' matrix instead.') except IndexError: pass # Invalid inputs if y.ndim > 2 or (y.dtype == object and len(y) and not isinstance(y.flat[0], string_types)): return 'unknown' # [[[1, 2]]] or [obj_1] and not ["label_1"] if y.ndim == 2 and y.shape[1] == 0: return 'unknown' # [[]] if y.ndim == 2 and y.shape[1] > 1: suffix = "-multioutput" # [[1, 2], [1, 2]] else: suffix = "" # [1, 2, 3] or [[1], [2], [3]] # check float and contains non-integer float values if y.dtype.kind == 'f' and np.any(y != y.astype(int)): # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.] return 'continuous' + suffix if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1): return 'multiclass' + suffix # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]] else: return 'binary' # [1, 2] or [["a"], ["b"]] def _check_partial_fit_first_call(clf, classes=None): """Private helper function for factorizing common classes param logic Estimators that implement the ``partial_fit`` API need to be provided with the list of possible classes at the first call to partial_fit. Subsequent calls to partial_fit should check that ``classes`` is still consistent with a previous value of ``clf.classes_`` when provided. This function returns True if it detects that this was the first call to ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also set on ``clf``. """ if getattr(clf, 'classes_', None) is None and classes is None: raise ValueError("classes must be passed on the first call " "to partial_fit.") elif classes is not None: if getattr(clf, 'classes_', None) is not None: if not np.array_equal(clf.classes_, unique_labels(classes)): raise ValueError( "`classes=%r` is not the same as on last call " "to partial_fit, was: %r" % (classes, clf.classes_)) else: # This is the first call to partial_fit clf.classes_ = unique_labels(classes) return True # classes is None and clf.classes_ has already previously been set: # nothing to do return False def class_distribution(y, sample_weight=None): """Compute class priors from multioutput-multiclass target data Parameters ---------- y : array like or sparse matrix of size (n_samples, n_outputs) The labels for each example. sample_weight : array-like of shape = (n_samples,), optional Sample weights. Returns ------- classes : list of size n_outputs of arrays of size (n_classes,) List of classes for each column. n_classes : list of integers of size n_outputs Number of classes in each column class_prior : list of size n_outputs of arrays of size (n_classes,) Class distribution of each column. """ classes = [] n_classes = [] class_prior = [] n_samples, n_outputs = y.shape if issparse(y): y = y.tocsc() y_nnz = np.diff(y.indptr) for k in range(n_outputs): col_nonzero = y.indices[y.indptr[k]:y.indptr[k + 1]] # separate sample weights for zero and non-zero elements if sample_weight is not None: nz_samp_weight = np.asarray(sample_weight)[col_nonzero] zeros_samp_weight_sum = (np.sum(sample_weight) - np.sum(nz_samp_weight)) else: nz_samp_weight = None zeros_samp_weight_sum = y.shape[0] - y_nnz[k] classes_k, y_k = np.unique(y.data[y.indptr[k]:y.indptr[k + 1]], return_inverse=True) class_prior_k = np.bincount(y_k, weights=nz_samp_weight) # An explicit zero was found, combine its weight with the weight # of the implicit zeros if 0 in classes_k: class_prior_k[classes_k == 0] += zeros_samp_weight_sum # If an there is an implicit zero and it is not in classes and # class_prior, make an entry for it if 0 not in classes_k and y_nnz[k] < y.shape[0]: classes_k = np.insert(classes_k, 0, 0) class_prior_k = np.insert(class_prior_k, 0, zeros_samp_weight_sum) classes.append(classes_k) n_classes.append(classes_k.shape[0]) class_prior.append(class_prior_k / class_prior_k.sum()) else: for k in range(n_outputs): classes_k, y_k = np.unique(y[:, k], return_inverse=True) classes.append(classes_k) n_classes.append(classes_k.shape[0]) class_prior_k = np.bincount(y_k, weights=sample_weight) class_prior.append(class_prior_k / class_prior_k.sum()) return (classes, n_classes, class_prior) def _ovr_decision_function(predictions, confidences, n_classes): """Compute a continuous, tie-breaking ovr decision function. It is important to include a continuous value, not only votes, to make computing AUC or calibration meaningful. Parameters ---------- predictions : array-like, shape (n_samples, n_classifiers) Predicted classes for each binary classifier. confidences : array-like, shape (n_samples, n_classifiers) Decision functions or predicted probabilities for positive class for each binary classifier. n_classes : int Number of classes. n_classifiers must be ``n_classes * (n_classes - 1 ) / 2`` """ n_samples = predictions.shape[0] votes = np.zeros((n_samples, n_classes)) sum_of_confidences = np.zeros((n_samples, n_classes)) k = 0 for i in range(n_classes): for j in range(i + 1, n_classes): sum_of_confidences[:, i] -= confidences[:, k] sum_of_confidences[:, j] += confidences[:, k] votes[predictions[:, k] == 0, i] += 1 votes[predictions[:, k] == 1, j] += 1 k += 1 max_confidences = sum_of_confidences.max() min_confidences = sum_of_confidences.min() if max_confidences == min_confidences: return votes # Scale the sum_of_confidences to (-0.5, 0.5) and add it with votes. # The motivation is to use confidence levels as a way to break ties in # the votes without switching any decision made based on a difference # of 1 vote. eps = np.finfo(sum_of_confidences.dtype).eps max_abs_confidence = max(abs(max_confidences), abs(min_confidences)) scale = (0.5 - eps) / max_abs_confidence return votes + sum_of_confidences * scale