  1. """Calibration of predicted probabilities."""
  2. # Author: Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
  3. # Balazs Kegl <balazs.kegl@gmail.com>
  4. # Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
  5. # Mathieu Blondel <mathieu@mblondel.org>
  6. #
  7. # License: BSD 3 clause
  8. from __future__ import division
  9. import warnings
  10. from math import log
  11. import numpy as np
  12. from scipy.optimize import fmin_bfgs
  13. from sklearn.preprocessing import LabelEncoder
  14. from .base import BaseEstimator, ClassifierMixin, RegressorMixin, clone
  15. from .preprocessing import label_binarize, LabelBinarizer
  16. from .utils import check_X_y, check_array, indexable, column_or_1d
  17. from .utils.validation import check_is_fitted, check_consistent_length
  18. from .utils.fixes import signature
  19. from .isotonic import IsotonicRegression
  20. from .svm import LinearSVC
  21. from .model_selection import check_cv
  22. from .metrics.classification import _check_binary_probabilistic_predictions


class CalibratedClassifierCV(BaseEstimator, ClassifierMixin):
    """Probability calibration with isotonic regression or sigmoid.

    With this class, the base_estimator is fit on the train set of the
    cross-validation generator and the test set is used for calibration.
    The probabilities for each of the folds are then averaged
    for prediction. In case that cv="prefit" is passed to __init__,
    it is assumed that base_estimator has been fitted already and all
    data is used for calibration. Note that data for fitting the
    classifier and for calibrating it must be disjoint.

    Read more in the :ref:`User Guide <calibration>`.

    Parameters
    ----------
    base_estimator : instance BaseEstimator
        The classifier whose output decision function needs to be calibrated
        to offer more accurate predict_proba outputs. If cv=prefit, the
        classifier must have been fit already on data.

    method : 'sigmoid' or 'isotonic'
        The method to use for calibration. Can be 'sigmoid' which
        corresponds to Platt's method or 'isotonic' which is a
        non-parametric approach. It is not advised to use isotonic calibration
        with too few calibration samples ``(<<1000)`` since it tends to
        overfit. Use sigmoids (Platt's calibration) in this case.

    cv : integer, cross-validation generator, iterable or "prefit", optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 3-fold cross-validation,
        - integer, to specify the number of folds.
        - An object to be used as a cross-validation generator.
        - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`sklearn.model_selection.StratifiedKFold` is used. If ``y`` is
        neither binary nor multiclass, :class:`sklearn.model_selection.KFold`
        is used.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        If "prefit" is passed, it is assumed that base_estimator has been
        fitted already and all data is used for calibration.

        .. versionchanged:: 0.20
            ``cv`` default value if None will change from 3-fold to 5-fold
            in v0.22.

    Attributes
    ----------
    classes_ : array, shape (n_classes)
        The class labels.

    calibrated_classifiers_ : list (len() equal to cv or 1 if cv == "prefit")
        The list of calibrated classifiers, one for each cross-validation
        fold, which has been fitted on all but the validation fold and
        calibrated on the validation fold.

    References
    ----------
    .. [1] Obtaining calibrated probability estimates from decision trees
           and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001

    .. [2] Transforming Classifier Scores into Accurate Multiclass
           Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)

    .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons
           to Regularized Likelihood Methods, J. Platt, (1999)

    .. [4] Predicting Good Probabilities with Supervised Learning,
           A. Niculescu-Mizil & R. Caruana, ICML 2005
    """

    def __init__(self, base_estimator=None, method='sigmoid', cv='warn'):
        self.base_estimator = base_estimator
        self.method = method
        self.cv = cv

    def fit(self, X, y, sample_weight=None):
        """Fit the calibrated model

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        y : array-like, shape (n_samples,)
            Target values.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'],
                         force_all_finite=False)
        X, y = indexable(X, y)
        le = LabelBinarizer().fit(y)
        self.classes_ = le.classes_

        # Check that each cross-validation fold can have at least one
        # example per class
        n_folds = self.cv if isinstance(self.cv, int) \
            else self.cv.n_folds if hasattr(self.cv, "n_folds") else None
        if n_folds and \
                np.any([np.sum(y == class_) < n_folds for class_ in
                        self.classes_]):
            raise ValueError("Requesting %d-fold cross-validation but provided"
                             " less than %d examples for at least one class."
                             % (n_folds, n_folds))

        self.calibrated_classifiers_ = []
        if self.base_estimator is None:
            # we want all classifiers that don't expose a random_state
            # to be deterministic (and we don't want to expose this one).
            base_estimator = LinearSVC(random_state=0)
        else:
            base_estimator = self.base_estimator

        if self.cv == "prefit":
            calibrated_classifier = _CalibratedClassifier(
                base_estimator, method=self.method)
            if sample_weight is not None:
                calibrated_classifier.fit(X, y, sample_weight)
            else:
                calibrated_classifier.fit(X, y)
            self.calibrated_classifiers_.append(calibrated_classifier)
        else:
            cv = check_cv(self.cv, y, classifier=True)
            fit_parameters = signature(base_estimator.fit).parameters
            estimator_name = type(base_estimator).__name__
            if (sample_weight is not None
                    and "sample_weight" not in fit_parameters):
                warnings.warn("%s does not support sample_weight. Samples"
                              " weights are only used for the calibration"
                              " itself." % estimator_name)
                base_estimator_sample_weight = None
            else:
                if sample_weight is not None:
                    sample_weight = check_array(sample_weight, ensure_2d=False)
                    check_consistent_length(y, sample_weight)
                base_estimator_sample_weight = sample_weight
            for train, test in cv.split(X, y):
                this_estimator = clone(base_estimator)
                if base_estimator_sample_weight is not None:
                    this_estimator.fit(
                        X[train], y[train],
                        sample_weight=base_estimator_sample_weight[train])
                else:
                    this_estimator.fit(X[train], y[train])

                calibrated_classifier = _CalibratedClassifier(
                    this_estimator, method=self.method,
                    classes=self.classes_)
                if sample_weight is not None:
                    calibrated_classifier.fit(X[test], y[test],
                                              sample_weight[test])
                else:
                    calibrated_classifier.fit(X[test], y[test])
                self.calibrated_classifiers_.append(calibrated_classifier)

        return self

    def predict_proba(self, X):
        """Posterior probabilities of classification

        This function returns posterior probabilities of classification
        according to each class on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The samples.

        Returns
        -------
        C : array, shape (n_samples, n_classes)
            The predicted probas.
        """
        check_is_fitted(self, ["classes_", "calibrated_classifiers_"])
        X = check_array(X, accept_sparse=['csc', 'csr', 'coo'],
                        force_all_finite=False)
        # Compute the arithmetic mean of the predictions of the calibrated
        # classifiers
        mean_proba = np.zeros((X.shape[0], len(self.classes_)))
        for calibrated_classifier in self.calibrated_classifiers_:
            proba = calibrated_classifier.predict_proba(X)
            mean_proba += proba

        mean_proba /= len(self.calibrated_classifiers_)

        return mean_proba

    def predict(self, X):
        """Predict the target of new samples. Can be different from the
        prediction of the uncalibrated classifier.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The samples.

        Returns
        -------
        C : array, shape (n_samples,)
            The predicted class.
        """
        check_is_fitted(self, ["classes_", "calibrated_classifiers_"])
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]
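

# Usage sketch for CalibratedClassifierCV (illustrative only, not part of this
# module). The imports below are the public scikit-learn paths for the classes
# defined here; the dataset size and parameter values are arbitrary assumptions.
#
#     >>> from sklearn.datasets import make_classification
#     >>> from sklearn.svm import LinearSVC
#     >>> from sklearn.calibration import CalibratedClassifierCV
#     >>> X, y = make_classification(n_samples=1000, random_state=0)
#     >>> clf = CalibratedClassifierCV(LinearSVC(random_state=0),
#     ...                              method='sigmoid', cv=3)
#     >>> proba = clf.fit(X, y).predict_proba(X)   # rows sum to 1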


class _CalibratedClassifier(object):
    """Probability calibration with isotonic regression or sigmoid.

    It assumes that base_estimator has already been fit, and trains the
    calibration on the input set of the fit function. Note that this class
    should not be used as an estimator directly. Use CalibratedClassifierCV
    with cv="prefit" instead.

    Parameters
    ----------
    base_estimator : instance BaseEstimator
        The classifier whose output decision function needs to be calibrated
        to offer more accurate predict_proba outputs. No default value since
        it has to be an already fitted estimator.

    method : 'sigmoid' | 'isotonic'
        The method to use for calibration. Can be 'sigmoid' which
        corresponds to Platt's method or 'isotonic' which is a
        non-parametric approach based on isotonic regression.

    classes : array-like, shape (n_classes,), optional
        Contains unique classes used to fit the base estimator.
        If None, then classes is extracted from the given target values
        in fit().

    See also
    --------
    CalibratedClassifierCV

    References
    ----------
    .. [1] Obtaining calibrated probability estimates from decision trees
           and naive Bayesian classifiers, B. Zadrozny & C. Elkan, ICML 2001

    .. [2] Transforming Classifier Scores into Accurate Multiclass
           Probability Estimates, B. Zadrozny & C. Elkan, (KDD 2002)

    .. [3] Probabilistic Outputs for Support Vector Machines and Comparisons
           to Regularized Likelihood Methods, J. Platt, (1999)

    .. [4] Predicting Good Probabilities with Supervised Learning,
           A. Niculescu-Mizil & R. Caruana, ICML 2005
    """

    def __init__(self, base_estimator, method='sigmoid', classes=None):
        self.base_estimator = base_estimator
        self.method = method
        self.classes = classes

    def _preproc(self, X):
        n_classes = len(self.classes_)
        if hasattr(self.base_estimator, "decision_function"):
            df = self.base_estimator.decision_function(X)
            if df.ndim == 1:
                df = df[:, np.newaxis]
        elif hasattr(self.base_estimator, "predict_proba"):
            df = self.base_estimator.predict_proba(X)
            if n_classes == 2:
                df = df[:, 1:]
        else:
            raise RuntimeError('classifier has no decision_function or '
                               'predict_proba method.')

        idx_pos_class = self.label_encoder_.\
            transform(self.base_estimator.classes_)

        return df, idx_pos_class

    def fit(self, X, y, sample_weight=None):
        """Calibrate the fitted model

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data.

        y : array-like, shape (n_samples,)
            Target values.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        self.label_encoder_ = LabelEncoder()
        if self.classes is None:
            self.label_encoder_.fit(y)
        else:
            self.label_encoder_.fit(self.classes)

        self.classes_ = self.label_encoder_.classes_
        Y = label_binarize(y, self.classes_)

        df, idx_pos_class = self._preproc(X)
        self.calibrators_ = []

        for k, this_df in zip(idx_pos_class, df.T):
            if self.method == 'isotonic':
                calibrator = IsotonicRegression(out_of_bounds='clip')
            elif self.method == 'sigmoid':
                calibrator = _SigmoidCalibration()
            else:
                raise ValueError('method should be "sigmoid" or '
                                 '"isotonic". Got %s.' % self.method)
            calibrator.fit(this_df, Y[:, k], sample_weight)
            self.calibrators_.append(calibrator)

        return self

    def predict_proba(self, X):
        """Posterior probabilities of classification

        This function returns posterior probabilities of classification
        according to each class on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The samples.

        Returns
        -------
        C : array, shape (n_samples, n_classes)
            The predicted probas. Can be exact zeros.
        """
        n_classes = len(self.classes_)
        proba = np.zeros((X.shape[0], n_classes))

        df, idx_pos_class = self._preproc(X)

        for k, this_df, calibrator in \
                zip(idx_pos_class, df.T, self.calibrators_):
            if n_classes == 2:
                k += 1
            proba[:, k] = calibrator.predict(this_df)

        # Normalize the probabilities
        if n_classes == 2:
            proba[:, 0] = 1. - proba[:, 1]
        else:
            proba /= np.sum(proba, axis=1)[:, np.newaxis]

        # XXX : for some reason all probas can be 0
        proba[np.isnan(proba)] = 1. / n_classes

        # Deal with cases where the predicted probability minimally exceeds 1.0
        proba[(1.0 < proba) & (proba <= 1.0 + 1e-5)] = 1.0

        return proba
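

# Worked example of the multiclass renormalization above (illustrative numbers
# only): each per-class calibrator is fitted one-vs-rest, so the calibrated
# columns need not sum to 1 and are divided by their row sum.
#
#     >>> raw = np.array([[0.2, 0.5, 0.1]])
#     >>> raw / raw.sum(axis=1, keepdims=True)   # -> [[0.25, 0.625, 0.125]]
#
# In the binary case only the positive-class calibrator is used and column 0
# is simply set to 1 minus column 1.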


def _sigmoid_calibration(df, y, sample_weight=None):
    """Probability Calibration with sigmoid method (Platt 2000)

    Parameters
    ----------
    df : ndarray, shape (n_samples,)
        The decision function or predict proba for the samples.

    y : ndarray, shape (n_samples,)
        The targets.

    sample_weight : array-like, shape = [n_samples] or None
        Sample weights. If None, then samples are equally weighted.

    Returns
    -------
    a : float
        The slope.

    b : float
        The intercept.

    References
    ----------
    Platt, "Probabilistic Outputs for Support Vector Machines"
    """
    df = column_or_1d(df)
    y = column_or_1d(y)

    F = df  # F follows Platt's notations
    tiny = np.finfo(np.float).tiny  # to avoid division by 0 warning

    # Bayesian priors (see Platt end of section 2.2)
    prior0 = float(np.sum(y <= 0))
    prior1 = y.shape[0] - prior0
    T = np.zeros(y.shape)
    T[y > 0] = (prior1 + 1.) / (prior1 + 2.)
    T[y <= 0] = 1. / (prior0 + 2.)
    T1 = 1. - T

    def objective(AB):
        # From Platt (beginning of Section 2.2)
        E = np.exp(AB[0] * F + AB[1])
        P = 1. / (1. + E)
        l = -(T * np.log(P + tiny) + T1 * np.log(1. - P + tiny))
        if sample_weight is not None:
            return (sample_weight * l).sum()
        else:
            return l.sum()

    def grad(AB):
        # gradient of the objective function
        E = np.exp(AB[0] * F + AB[1])
        P = 1. / (1. + E)
        TEP_minus_T1P = P * (T * E - T1)
        if sample_weight is not None:
            TEP_minus_T1P *= sample_weight
        dA = np.dot(TEP_minus_T1P, F)
        dB = np.sum(TEP_minus_T1P)
        return np.array([dA, dB])

    AB0 = np.array([0., log((prior0 + 1.) / (prior1 + 1.))])
    AB_ = fmin_bfgs(objective, AB0, fprime=grad, disp=False)
    return AB_[0], AB_[1]
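

# Summary of the model fitted above (notation taken from the code): the
# calibrated probability of the positive class given a decision value F is
#
#     P(y = 1 | F) = 1 / (1 + exp(a * F + b)),
#
# where the returned slope ``a`` and intercept ``b`` minimise the weighted
# cross-entropy between P and the smoothed targets T built from the class
# priors, as in Platt (1999). BFGS is started from a = 0 and
# b = log((prior0 + 1) / (prior1 + 1)).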


class _SigmoidCalibration(BaseEstimator, RegressorMixin):
    """Sigmoid regression model.

    Attributes
    ----------
    a_ : float
        The slope.

    b_ : float
        The intercept.
    """

    def fit(self, X, y, sample_weight=None):
        """Fit the model using X, y as training data.

        Parameters
        ----------
        X : array-like, shape (n_samples,)
            Training data.

        y : array-like, shape (n_samples,)
            Training target.

        sample_weight : array-like, shape = [n_samples] or None
            Sample weights. If None, then samples are equally weighted.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        X = column_or_1d(X)
        y = column_or_1d(y)
        X, y = indexable(X, y)

        self.a_, self.b_ = _sigmoid_calibration(X, y, sample_weight)
        return self

    def predict(self, T):
        """Predict new data by applying the fitted sigmoid transformation.

        Parameters
        ----------
        T : array-like, shape (n_samples,)
            Data to predict from.

        Returns
        -------
        T_ : array, shape (n_samples,)
            The predicted data.
        """
        T = column_or_1d(T)
        return 1. / (1. + np.exp(self.a_ * T + self.b_))


def calibration_curve(y_true, y_prob, normalize=False, n_bins=5):
    """Compute true and predicted probabilities for a calibration curve.

    The method assumes the inputs come from a binary classifier.

    Calibration curves may also be referred to as reliability diagrams.

    Read more in the :ref:`User Guide <calibration>`.

    Parameters
    ----------
    y_true : array, shape (n_samples,)
        True targets.

    y_prob : array, shape (n_samples,)
        Probabilities of the positive class.

    normalize : bool, optional, default=False
        Whether y_prob needs to be normalized into the interval [0, 1], i.e.
        is not a proper probability. If True, the smallest value in y_prob
        is mapped onto 0 and the largest one onto 1.

    n_bins : int
        Number of bins. A bigger number requires more data.

    Returns
    -------
    prob_true : array, shape (n_bins,)
        The true probability in each bin (fraction of positives).

    prob_pred : array, shape (n_bins,)
        The mean predicted probability in each bin.

    References
    ----------
    Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good
    Probabilities With Supervised Learning, in Proceedings of the 22nd
    International Conference on Machine Learning (ICML).
    See section 4 (Qualitative Analysis of Predictions).
    """
    y_true = column_or_1d(y_true)
    y_prob = column_or_1d(y_prob)

    if normalize:  # Normalize predicted values into interval [0, 1]
        y_prob = (y_prob - y_prob.min()) / (y_prob.max() - y_prob.min())
    elif y_prob.min() < 0 or y_prob.max() > 1:
        raise ValueError("y_prob has values outside [0, 1] and normalize is "
                         "set to False.")

    y_true = _check_binary_probabilistic_predictions(y_true, y_prob)

    bins = np.linspace(0., 1. + 1e-8, n_bins + 1)
    binids = np.digitize(y_prob, bins) - 1

    bin_sums = np.bincount(binids, weights=y_prob, minlength=len(bins))
    bin_true = np.bincount(binids, weights=y_true, minlength=len(bins))
    bin_total = np.bincount(binids, minlength=len(bins))

    nonzero = bin_total != 0
    prob_true = (bin_true[nonzero] / bin_total[nonzero])
    prob_pred = (bin_sums[nonzero] / bin_total[nonzero])

    return prob_true, prob_pred
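

# Usage sketch for calibration_curve (illustrative only; the toy inputs and
# the expected outputs below are assumptions worked out by hand from the
# binning logic above, not part of the module):
#
#     >>> import numpy as np
#     >>> from sklearn.calibration import calibration_curve
#     >>> y_true = np.array([0, 0, 0, 1, 1, 1])
#     >>> y_prob = np.array([0.1, 0.2, 0.4, 0.5, 0.7, 0.9])
#     >>> prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=2)
#     >>> prob_true   # -> [0.25, 1.0], fraction of positives per non-empty bin
#     >>> prob_pred   # -> [0.3, 0.8], mean predicted probability per bin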