# Authors: Fabian Pedregosa <fabian@fseoane.net>
#          Alexandre Gramfort <alexandre.gramfort@inria.fr>
#          Nelle Varoquaux <nelle.varoquaux@gmail.com>
# License: BSD 3 clause

import math
import warnings

import numpy as np
from scipy import interpolate
from scipy.stats import spearmanr

from .base import BaseEstimator, TransformerMixin, RegressorMixin
from .utils import as_float_array, check_array, check_consistent_length
from ._isotonic import _inplace_contiguous_isotonic_regression, _make_unique


__all__ = ['check_increasing', 'isotonic_regression', 'IsotonicRegression']


def check_increasing(x, y):
    """Determine whether y is monotonically correlated with x.

    y is found increasing or decreasing with respect to x based on a Spearman
    correlation test.

    Parameters
    ----------
    x : array-like, shape=(n_samples,)
        Training data.

    y : array-like, shape=(n_samples,)
        Training target.

    Returns
    -------
    increasing_bool : boolean
        Whether the relationship is increasing or decreasing.

    Notes
    -----
    The Spearman correlation coefficient is estimated from the data, and the
    sign of the resulting estimate is used as the result.

    In the event that the 95% confidence interval based on Fisher transform
    spans zero, a warning is raised.

    References
    ----------
    Fisher transformation. Wikipedia.
    https://en.wikipedia.org/wiki/Fisher_transformation
    """
    # Calculate Spearman rho estimate and set return accordingly.
    rho, _ = spearmanr(x, y)
    increasing_bool = rho >= 0

    # Run Fisher transform to get the rho CI, but handle rho=+/-1
    if rho not in [-1.0, 1.0] and len(x) > 3:
        F = 0.5 * math.log((1. + rho) / (1. - rho))
        F_se = 1 / math.sqrt(len(x) - 3)

        # Use a 95% CI, i.e., +/-1.96 S.E.
        # https://en.wikipedia.org/wiki/Fisher_transformation
        rho_0 = math.tanh(F - 1.96 * F_se)
        rho_1 = math.tanh(F + 1.96 * F_se)

        # Warn if the CI spans zero.
        if np.sign(rho_0) != np.sign(rho_1):
            warnings.warn("Confidence interval of the Spearman "
                          "correlation coefficient spans zero. "
                          "Determination of ``increasing`` may be "
                          "suspect.")

    return increasing_bool
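

# Usage sketch for check_increasing (illustrative, not part of the original
# source). With a perfectly monotone relationship, Spearman's rho is +/-1, the
# confidence-interval check is skipped, and the sign alone decides:
#
#     >>> check_increasing([1, 2, 3, 4, 5], [2, 4, 6, 8, 10])
#     True
#     >>> check_increasing([1, 2, 3, 4, 5], [10, 8, 6, 4, 2])
#     False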


def isotonic_regression(y, sample_weight=None, y_min=None, y_max=None,
                        increasing=True):
    """Solve the isotonic regression model::

        min sum w[i] (y[i] - y_[i]) ** 2

        subject to y_min = y_[1] <= y_[2] ... <= y_[n] = y_max

    where:
        - y[i] are inputs (real numbers)
        - y_[i] are fitted
        - w[i] are optional strictly positive weights (default to 1.0)

    Read more in the :ref:`User Guide <isotonic>`.

    Parameters
    ----------
    y : iterable of floats
        The data.

    sample_weight : iterable of floats, optional, default: None
        Weights on each point of the regression.
        If None, weight is set to 1 (equal weights).

    y_min : optional, default: None
        If not None, set the lowest value of the fit to y_min.

    y_max : optional, default: None
        If not None, set the highest value of the fit to y_max.

    increasing : boolean, optional, default: True
        Whether the fitted values ``y_`` should be increasing (if set to
        True) or decreasing (if set to False).

    Returns
    -------
    y_ : list of floats
        Isotonic fit of y.

    References
    ----------
    "Active set algorithms for isotonic regression; A unifying framework"
    by Michael J. Best and Nilotpal Chakravarti, section 3.
    """
    order = np.s_[:] if increasing else np.s_[::-1]
    y = np.array(y[order], dtype=np.float64)
    if sample_weight is None:
        sample_weight = np.ones(len(y), dtype=np.float64)
    else:
        sample_weight = np.array(sample_weight[order], dtype=np.float64)

    _inplace_contiguous_isotonic_regression(y, sample_weight)
    if y_min is not None or y_max is not None:
        # Older versions of np.clip don't accept None as a bound, so use
        # np.inf
        if y_min is None:
            y_min = -np.inf
        if y_max is None:
            y_max = np.inf
        np.clip(y, y_min, y_max, y)
    return y[order]
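

# Usage sketch for isotonic_regression (illustrative, not part of the
# original source). Pool-adjacent-violators replaces each decreasing run with
# its weighted mean; here the violating prefix [4, 2, 3, 1] should pool to
# its mean 2.5, while the final 5 already satisfies the ordering:
#
#     >>> isotonic_regression([4, 2, 3, 1, 5])
#     array([2.5, 2.5, 2.5, 2.5, 5. ])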


class IsotonicRegression(BaseEstimator, TransformerMixin, RegressorMixin):
    """Isotonic regression model.

    The isotonic regression optimization problem is defined by::

        min sum w_i (y[i] - y_[i]) ** 2

        subject to y_[i] <= y_[j] whenever X[i] <= X[j]
        and min(y_) = y_min, max(y_) = y_max

    where:
        - ``y[i]`` are inputs (real numbers)
        - ``y_[i]`` are fitted
        - ``X`` specifies the order.
          If ``X`` is non-decreasing then ``y_`` is non-decreasing.
        - ``w[i]`` are optional strictly positive weights (default to 1.0)

    Read more in the :ref:`User Guide <isotonic>`.

    Parameters
    ----------
    y_min : optional, default: None
        If not None, set the lowest value of the fit to y_min.

    y_max : optional, default: None
        If not None, set the highest value of the fit to y_max.

    increasing : boolean or string, optional, default: True
        If boolean, whether or not to fit the isotonic regression with y
        increasing or decreasing.

        The string value "auto" determines whether y should increase or
        decrease based on the Spearman correlation estimate's sign.

    out_of_bounds : string, optional, default: "nan"
        Determines how x-values outside of the training domain are handled.
        When set to "nan", predicted y-values will be NaN. When set to
        "clip", predicted y-values will be set to the value corresponding to
        the nearest train interval endpoint. When set to "raise",
        ``interp1d`` is allowed to raise a ValueError.

    Attributes
    ----------
    X_min_ : float
        Minimum value of input array `X_` for left bound.

    X_max_ : float
        Maximum value of input array `X_` for right bound.

    f_ : function
        The stepwise interpolating function that covers the input domain
        ``X``.

    Notes
    -----
    Ties are broken using the secondary method from Leeuw, 1977.

    References
    ----------
    Isotonic Median Regression: A Linear Programming Approach
    Nilotpal Chakravarti
    Mathematics of Operations Research
    Vol. 14, No. 2 (May, 1989), pp. 303-308

    Isotone Optimization in R : Pool-Adjacent-Violators
    Algorithm (PAVA) and Active Set Methods
    Leeuw, Hornik, Mair
    Journal of Statistical Software 2009

    Correctness of Kruskal's algorithms for monotone regression with ties
    Leeuw, Psychometrika, 1977
    """

    def __init__(self, y_min=None, y_max=None, increasing=True,
                 out_of_bounds='nan'):
        self.y_min = y_min
        self.y_max = y_max
        self.increasing = increasing
        self.out_of_bounds = out_of_bounds

    def _check_fit_data(self, X, y, sample_weight=None):
        if len(X.shape) != 1:
            raise ValueError("X should be a 1d array")

    def _build_f(self, X, y):
        """Build the f_ interp1d function."""
        # Handle the out_of_bounds argument by setting bounds_error
        if self.out_of_bounds not in ["raise", "nan", "clip"]:
            raise ValueError("The argument ``out_of_bounds`` must be in "
                             "'nan', 'clip', 'raise'; got {0}"
                             .format(self.out_of_bounds))

        bounds_error = self.out_of_bounds == "raise"
        if len(y) == 1:
            # single y, constant prediction
            self.f_ = lambda x: y.repeat(x.shape)
        else:
            self.f_ = interpolate.interp1d(X, y, kind='linear',
                                           bounds_error=bounds_error)
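
    # Behaviour sketch for ``f_`` (illustrative, not part of the original
    # source): with more than one training point, the fitted function
    # interpolates linearly between the stored (X, y) pairs.
    #
    #     >>> ir = IsotonicRegression().fit([1, 2, 3], [1, 2, 2])
    #     >>> float(ir.f_(1.5))
    #     1.5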

    def _build_y(self, X, y, sample_weight, trim_duplicates=True):
        """Build the y_ IsotonicRegression."""
        check_consistent_length(X, y, sample_weight)
        X, y = [check_array(x, ensure_2d=False) for x in [X, y]]

        y = as_float_array(y)
        self._check_fit_data(X, y, sample_weight)

        # Determine increasing if auto-determination requested
        if self.increasing == 'auto':
            self.increasing_ = check_increasing(X, y)
        else:
            self.increasing_ = self.increasing

        # If sample_weight is passed, remove zero-weight values and clean
        # the order
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            mask = sample_weight > 0
            X, y, sample_weight = X[mask], y[mask], sample_weight[mask]
        else:
            sample_weight = np.ones(len(y))

        order = np.lexsort((y, X))
        X, y, sample_weight = [array[order].astype(np.float64, copy=False)
                               for array in [X, y, sample_weight]]
        unique_X, unique_y, unique_sample_weight = _make_unique(
            X, y, sample_weight)

        # Store _X_ and _y_ to maintain backward compat during the deprecation
        # period of X_ and y_
        self._X_ = X = unique_X
        self._y_ = y = isotonic_regression(unique_y, unique_sample_weight,
                                           self.y_min, self.y_max,
                                           increasing=self.increasing_)

        # Handle the left and right bounds on X
        self.X_min_, self.X_max_ = np.min(X), np.max(X)

        if trim_duplicates:
            # Remove unnecessary points for faster prediction
            keep_data = np.ones((len(y),), dtype=bool)
            # Aside from the 1st and last point, remove points whose y values
            # are equal to both the point before and the point after it.
            keep_data[1:-1] = np.logical_or(
                np.not_equal(y[1:-1], y[:-2]),
                np.not_equal(y[1:-1], y[2:])
            )
            return X[keep_data], y[keep_data]
        else:
            # The ability to turn off trim_duplicates is only used to make it
            # easier to unit test that removing duplicates in y does not have
            # any impact on the resulting interpolation function (besides
            # prediction speed).
            return X, y
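
    # Trim-duplicates sketch (illustrative, not part of the original source):
    # for fitted values y = [1, 2, 2, 2, 3], the interior point of the
    # plateau of 2s is dropped, since linear interpolation between the
    # plateau's endpoints reproduces it exactly:
    #
    #     keep_data == [True, True, False, True, True]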

    def fit(self, X, y, sample_weight=None):
        """Fit the model using X, y as training data.

        Parameters
        ----------
        X : array-like, shape=(n_samples,)
            Training data.

        y : array-like, shape=(n_samples,)
            Training target.

        sample_weight : array-like, shape=(n_samples,), optional, default: None
            Weights. If set to None, all weights will be set to 1 (equal
            weights).

        Returns
        -------
        self : object
            Returns an instance of self.

        Notes
        -----
        X is stored for future use, as `transform` needs X to interpolate
        new input data.
        """
        # Transform y by running the isotonic regression algorithm and
        # transform X accordingly.
        X, y = self._build_y(X, y, sample_weight)

        # It is necessary to store the non-redundant part of the training set
        # on the model to make it possible to support model persistence via
        # the pickle module as the object built by scipy.interp1d is not
        # picklable directly.
        self._necessary_X_, self._necessary_y_ = X, y

        # Build the interpolation function
        self._build_f(X, y)
        return self
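
    # Fit/predict sketch (illustrative, not part of the original source; the
    # fitted values follow the isotonic_regression example above, so queries
    # land on the 2.5 plateau or on the segment rising from 2.5 to 5):
    #
    #     >>> ir = IsotonicRegression()
    #     >>> _ = ir.fit([1, 2, 3, 4, 5], [4, 2, 3, 1, 5])
    #     >>> ir.predict([1.5, 4.5])
    #     array([2.5 , 3.75])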

    def transform(self, T):
        """Transform new data by linear interpolation.

        Parameters
        ----------
        T : array-like, shape=(n_samples,)
            Data to transform.

        Returns
        -------
        T_ : array, shape=(n_samples,)
            The transformed data.
        """
        T = as_float_array(T)
        if len(T.shape) != 1:
            raise ValueError("Isotonic regression input should be a 1d array")

        # Handle the out_of_bounds argument by clipping if needed
        if self.out_of_bounds not in ["raise", "nan", "clip"]:
            raise ValueError("The argument ``out_of_bounds`` must be in "
                             "'nan', 'clip', 'raise'; got {0}"
                             .format(self.out_of_bounds))

        if self.out_of_bounds == "clip":
            T = np.clip(T, self.X_min_, self.X_max_)
        return self.f_(T)
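
    # Out-of-bounds sketch (illustrative, not part of the original source):
    # with the default ``out_of_bounds="nan"``, queries outside
    # [X_min_, X_max_] come back as NaN; with "clip" they are pinned to the
    # nearest bound before interpolation:
    #
    #     >>> ir = IsotonicRegression().fit([1, 2, 3], [1, 2, 3])
    #     >>> ir.transform([0, 4])
    #     array([nan, nan])
    #     >>> ir_clip = IsotonicRegression(out_of_bounds="clip")
    #     >>> ir_clip.fit([1, 2, 3], [1, 2, 3]).transform([0, 4])
    #     array([1., 3.])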

    def predict(self, T):
        """Predict new data by linear interpolation.

        Parameters
        ----------
        T : array-like, shape=(n_samples,)
            Data to transform.

        Returns
        -------
        T_ : array, shape=(n_samples,)
            Transformed data.
        """
        return self.transform(T)

    def __getstate__(self):
        """Pickle-protocol - return state of the estimator."""
        state = super(IsotonicRegression, self).__getstate__()
        # remove interpolation method
        state.pop('f_', None)
        return state

    def __setstate__(self, state):
        """Pickle-protocol - set state of the estimator.

        We need to rebuild the interpolation function.
        """
        super(IsotonicRegression, self).__setstate__(state)
        if hasattr(self, '_necessary_X_') and hasattr(self, '_necessary_y_'):
            self._build_f(self._necessary_X_, self._necessary_y_)
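

# Pickling sketch (illustrative, not part of the original source): the
# __getstate__/__setstate__ pair drops the interp1d object, which the comment
# in ``fit`` notes is not directly picklable, and rebuilds it from the stored
# training points on load, so a round-trip preserves predictions:
#
#     >>> import pickle
#     >>> ir = IsotonicRegression().fit([1, 2, 3], [1, 2, 3])
#     >>> ir2 = pickle.loads(pickle.dumps(ir))
#     >>> ir2.predict([1.5])
#     array([1.5])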