- # Authors: Fabian Pedregosa <fabian@fseoane.net>
- # Alexandre Gramfort <alexandre.gramfort@inria.fr>
- # Nelle Varoquaux <nelle.varoquaux@gmail.com>
- # License: BSD 3 clause
-
- import numpy as np
- from scipy import interpolate
- from scipy.stats import spearmanr
- from .base import BaseEstimator, TransformerMixin, RegressorMixin
- from .utils import as_float_array, check_array, check_consistent_length
- from ._isotonic import _inplace_contiguous_isotonic_regression, _make_unique
- import warnings
- import math
-
-
- __all__ = ['check_increasing', 'isotonic_regression',
- 'IsotonicRegression']
-
-
- def check_increasing(x, y):
- """Determine whether y is monotonically correlated with x.
-
- y is found increasing or decreasing with respect to x based on a Spearman
- correlation test.
-
- Parameters
- ----------
- x : array-like, shape=(n_samples,)
- Training data.
-
- y : array-like, shape=(n_samples,)
- Training target.
-
- Returns
- -------
- increasing_bool : boolean
- True if the relationship is increasing, False if it is decreasing.
-
- Notes
- -----
- The Spearman correlation coefficient is estimated from the data, and the
- sign of the resulting estimate is used as the result.
-
- In the event that the 95% confidence interval based on the Fisher
- transform spans zero, a warning is raised.
-
- References
- ----------
- Fisher transformation. Wikipedia.
- https://en.wikipedia.org/wiki/Fisher_transformation
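-
- Examples
- --------
- A minimal doctest with illustrative toy values:
-
- >>> from sklearn.isotonic import check_increasing
- >>> x = [1, 2, 3, 4, 5]
- >>> bool(check_increasing(x, [2, 4, 6, 8, 10]))
- True
- >>> bool(check_increasing(x, [10, 8, 6, 4, 2]))
- False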
- """
-
- # Calculate Spearman rho estimate and set return accordingly.
- rho, _ = spearmanr(x, y)
- increasing_bool = rho >= 0
-
- # Run Fisher transform to get the rho CI, but handle rho=+/-1
- if rho not in [-1.0, 1.0] and len(x) > 3:
- F = 0.5 * math.log((1. + rho) / (1. - rho))
- F_se = 1 / math.sqrt(len(x) - 3)
-
- # Use a 95% CI, i.e., +/-1.96 S.E.
- # https://en.wikipedia.org/wiki/Fisher_transformation
- rho_0 = math.tanh(F - 1.96 * F_se)
- rho_1 = math.tanh(F + 1.96 * F_se)
-
- # Warn if the CI spans zero.
- if np.sign(rho_0) != np.sign(rho_1):
- warnings.warn("Confidence interval of the Spearman "
- "correlation coefficient spans zero. "
- "Determination of ``increasing`` may be "
- "suspect.")
-
- return increasing_bool
-
-
- def isotonic_regression(y, sample_weight=None, y_min=None, y_max=None,
- increasing=True):
- """Solve the isotonic regression model::
-
- min sum w[i] (y[i] - y_[i]) ** 2
-
- subject to y_[1] <= y_[2] <= ... <= y_[n]
-
- where:
- - y[i] are inputs (real numbers)
- - y_[i] are fitted
- - w[i] are optional strictly positive weights (default to 1.0)
-
- If ``y_min`` or ``y_max`` is given, the fitted values are additionally
- clipped to the interval ``[y_min, y_max]``.
-
- Read more in the :ref:`User Guide <isotonic>`.
-
- Parameters
- ----------
- y : iterable of floats
- The data.
-
- sample_weight : iterable of floats, optional, default: None
- Weights on each point of the regression.
- If None, weight is set to 1 (equal weights).
-
- y_min : optional, default: None
- If not None, lower bound for the fitted values.
-
- y_max : optional, default: None
- If not None, upper bound for the fitted values.
-
- increasing : boolean, optional, default: True
- Whether the fitted values ``y_`` should be increasing (if set to
- True) or decreasing (if set to False).
-
- Returns
- -------
- y_ : ndarray of floats
- Isotonic fit of y.
-
- References
- ----------
- "Active set algorithms for isotonic regression; A unifying framework"
- by Michael J. Best and Nilotpal Chakravarti, section 3.
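-
- Examples
- --------
- A small sketch with illustrative values; the violating sequence is
- pooled to its mean (here all three values average to 2):
-
- >>> from sklearn.isotonic import isotonic_regression
- >>> [float(v) for v in isotonic_regression([3, 1, 2])]
- [2.0, 2.0, 2.0]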
- """
- order = np.s_[:] if increasing else np.s_[::-1]
- y = np.array(y[order], dtype=np.float64)
- if sample_weight is None:
- sample_weight = np.ones(len(y), dtype=np.float64)
- else:
- sample_weight = np.array(sample_weight[order], dtype=np.float64)
-
- _inplace_contiguous_isotonic_regression(y, sample_weight)
- if y_min is not None or y_max is not None:
- # Older versions of np.clip don't accept None as a bound, so use np.inf
- if y_min is None:
- y_min = -np.inf
- if y_max is None:
- y_max = np.inf
- np.clip(y, y_min, y_max, y)
- return y[order]
-
-
- class IsotonicRegression(BaseEstimator, TransformerMixin, RegressorMixin):
- """Isotonic regression model.
-
- The isotonic regression optimization problem is defined by::
-
- min sum w[i] (y[i] - y_[i]) ** 2
-
- subject to y_[i] <= y_[j] whenever X[i] <= X[j]
- and, when specified, y_min <= y_[i] <= y_max
-
- where:
- - ``y[i]`` are inputs (real numbers)
- - ``y_[i]`` are fitted
- - ``X`` specifies the order.
- If ``X`` is non-decreasing then ``y_`` is non-decreasing.
- - ``w[i]`` are optional strictly positive weights (default to 1.0)
-
- Read more in the :ref:`User Guide <isotonic>`.
-
- Parameters
- ----------
- y_min : optional, default: None
- If not None, lower bound for the fitted values.
-
- y_max : optional, default: None
- If not None, upper bound for the fitted values.
-
- increasing : boolean or string, optional, default: True
- If boolean, whether or not to fit the isotonic regression with y
- increasing or decreasing.
-
- The string value "auto" determines whether y should
- increase or decrease based on the Spearman correlation estimate's
- sign.
-
- out_of_bounds : string, optional, default: "nan"
- Determines how x-values outside of the training domain are handled
- during prediction. When set to "nan", predicted y-values will be NaN.
- When set to "clip", predicted y-values will be set to the value
- corresponding to the nearest train interval endpoint. When set to
- "raise", a ValueError is raised by ``interp1d``.
-
-
- Attributes
- ----------
- X_min_ : float
- Minimum value of input array `X_` for left bound.
-
- X_max_ : float
- Maximum value of input array `X_` for right bound.
-
- f_ : function
- The stepwise interpolating function that covers the input domain ``X``.
-
- Notes
- -----
- Ties are broken using the secondary method from Leeuw, 1977.
-
- References
- ----------
- Isotonic Median Regression: A Linear Programming Approach
- Nilotpal Chakravarti
- Mathematics of Operations Research
- Vol. 14, No. 2 (May, 1989), pp. 303-308
-
- Isotone Optimization in R : Pool-Adjacent-Violators
- Algorithm (PAVA) and Active Set Methods
- Leeuw, Hornik, Mair
- Journal of Statistical Software 2009
-
- Correctness of Kruskal's algorithms for monotone regression with ties
- Leeuw, Psychometrika, 1977
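-
- Examples
- --------
- A small sketch with illustrative toy data; the out-of-order targets
- 3 and 2 are pooled to their mean 2.5 before interpolation:
-
- >>> from sklearn.isotonic import IsotonicRegression
- >>> ir = IsotonicRegression().fit([1, 2, 3, 4], [1, 3, 2, 4])
- >>> [float(v) for v in ir.predict([1.5, 3.5])]
- [1.75, 3.25]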
- """
- def __init__(self, y_min=None, y_max=None, increasing=True,
- out_of_bounds='nan'):
- self.y_min = y_min
- self.y_max = y_max
- self.increasing = increasing
- self.out_of_bounds = out_of_bounds
-
- def _check_fit_data(self, X, y, sample_weight=None):
- if len(X.shape) != 1:
- raise ValueError("X should be a 1d array")
-
- def _build_f(self, X, y):
- """Build the f_ interp1d function."""
-
- # Handle the out_of_bounds argument by setting bounds_error
- if self.out_of_bounds not in ["raise", "nan", "clip"]:
- raise ValueError("The argument ``out_of_bounds`` must be in "
- "'nan', 'clip', 'raise'; got {0}"
- .format(self.out_of_bounds))
-
- bounds_error = self.out_of_bounds == "raise"
- if len(y) == 1:
- # single y, constant prediction
- self.f_ = lambda x: y.repeat(x.shape)
- else:
- self.f_ = interpolate.interp1d(X, y, kind='linear',
- bounds_error=bounds_error)
-
- def _build_y(self, X, y, sample_weight, trim_duplicates=True):
- """Build the y_ IsotonicRegression."""
- check_consistent_length(X, y, sample_weight)
- X, y = [check_array(x, ensure_2d=False) for x in [X, y]]
-
- y = as_float_array(y)
- self._check_fit_data(X, y, sample_weight)
-
- # Determine increasing if auto-determination requested
- if self.increasing == 'auto':
- self.increasing_ = check_increasing(X, y)
- else:
- self.increasing_ = self.increasing
-
- # If sample_weight is passed, remove zero-weight values and clean
- # the order
- if sample_weight is not None:
- sample_weight = check_array(sample_weight, ensure_2d=False)
- mask = sample_weight > 0
- X, y, sample_weight = X[mask], y[mask], sample_weight[mask]
- else:
- sample_weight = np.ones(len(y))
-
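- # Sort primarily by X; lexsort breaks ties in X by y, and _make_unique
- # below merges duplicate X values into a single weighted-average y
- # (the tie handling referenced in the class docstring notes).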
- order = np.lexsort((y, X))
- X, y, sample_weight = [array[order].astype(np.float64, copy=False)
- for array in [X, y, sample_weight]]
- unique_X, unique_y, unique_sample_weight = _make_unique(
- X, y, sample_weight)
-
- # Store _X_ and _y_ to maintain backward compat during the deprecation
- # period of X_ and y_
- self._X_ = X = unique_X
- self._y_ = y = isotonic_regression(unique_y, unique_sample_weight,
- self.y_min, self.y_max,
- increasing=self.increasing_)
-
- # Handle the left and right bounds on X
- self.X_min_, self.X_max_ = np.min(X), np.max(X)
-
- if trim_duplicates:
- # Remove unnecessary points for faster prediction
- keep_data = np.ones((len(y),), dtype=bool)
- # Aside from the 1st and last point, remove points whose y values
- # are equal to both the point before and the point after it.
- keep_data[1:-1] = np.logical_or(
- np.not_equal(y[1:-1], y[:-2]),
- np.not_equal(y[1:-1], y[2:])
- )
- return X[keep_data], y[keep_data]
- else:
- # The ability to turn off trim_duplicates is only used to make it
- # easier to unit test that removing duplicates in y does not have
- # any impact on the resulting interpolation function (besides
- # prediction speed).
- return X, y
-
- def fit(self, X, y, sample_weight=None):
- """Fit the model using X, y as training data.
-
- Parameters
- ----------
- X : array-like, shape=(n_samples,)
- Training data.
-
- y : array-like, shape=(n_samples,)
- Training target.
-
- sample_weight : array-like, shape=(n_samples,), optional, default: None
- Weights. If set to None, all weights will be set to 1 (equal
- weights).
-
- Returns
- -------
- self : object
- Returns an instance of self.
-
- Notes
- -----
- X is stored for future use, as `transform` needs X to interpolate
- new input data.
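-
- Examples
- --------
- A sketch with illustrative values; with ``y_min=0`` and ``y_max=1``
- the fitted values are clipped into ``[0, 1]``:
-
- >>> from sklearn.isotonic import IsotonicRegression
- >>> ir = IsotonicRegression(y_min=0, y_max=1)
- >>> ir = ir.fit([1, 2, 3], [0.2, -0.5, 1.4])
- >>> [float(v) for v in ir.predict([1, 3])]
- [0.0, 1.0]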
- """
- # Transform y by running the isotonic regression algorithm and
- # transform X accordingly.
- X, y = self._build_y(X, y, sample_weight)
-
- # It is necessary to store the non-redundant part of the training set
- # on the model to make it possible to support model persistence via
- # the pickle module, as the object built by scipy.interpolate.interp1d
- # is not directly picklable.
- self._necessary_X_, self._necessary_y_ = X, y
-
- # Build the interpolation function
- self._build_f(X, y)
- return self
-
- def transform(self, T):
- """Transform new data by linear interpolation
-
- Parameters
- ----------
- T : array-like, shape=(n_samples,)
- Data to transform.
-
- Returns
- -------
- T_ : array, shape=(n_samples,)
- The transformed data.
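-
- Examples
- --------
- A sketch of the ``out_of_bounds="clip"`` behaviour with toy values;
- inputs outside the training domain [1, 2] are clipped to its ends:
-
- >>> from sklearn.isotonic import IsotonicRegression
- >>> ir = IsotonicRegression(out_of_bounds="clip").fit([1, 2], [1, 2])
- >>> [float(v) for v in ir.transform([0, 3])]
- [1.0, 2.0]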
- """
- T = as_float_array(T)
- if len(T.shape) != 1:
- raise ValueError("Isotonic regression input should be a 1d array")
-
- # Handle the out_of_bounds argument by clipping if needed
- if self.out_of_bounds not in ["raise", "nan", "clip"]:
- raise ValueError("The argument ``out_of_bounds`` must be in "
- "'nan', 'clip', 'raise'; got {0}"
- .format(self.out_of_bounds))
-
- if self.out_of_bounds == "clip":
- T = np.clip(T, self.X_min_, self.X_max_)
- return self.f_(T)
-
- def predict(self, T):
- """Predict new data by linear interpolation.
-
- Parameters
- ----------
- T : array-like, shape=(n_samples,)
- Data to transform.
-
- Returns
- -------
- T_ : array, shape=(n_samples,)
- Transformed data.
- """
- return self.transform(T)
-
- def __getstate__(self):
- """Pickle-protocol - return state of the estimator. """
- state = super(IsotonicRegression, self).__getstate__()
- # remove interpolation method
- state.pop('f_', None)
- return state
-
- def __setstate__(self, state):
- """Pickle-protocol - set state of the estimator.
-
- We need to rebuild the interpolation function.
- """
- super(IsotonicRegression, self).__setstate__(state)
- if hasattr(self, '_necessary_X_') and hasattr(self, '_necessary_y_'):
- self._build_f(self._necessary_X_, self._necessary_y_)
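-
-
- # A minimal pickling round-trip sketch (illustrative values): the
- # __getstate__/__setstate__ pair above drops the unpicklable interp1d
- # object ``f_`` and rebuilds it from the stored training points.
- #
- #     import pickle
- #     ir = IsotonicRegression().fit([1, 2, 3, 4], [1, 3, 2, 4])
- #     ir2 = pickle.loads(pickle.dumps(ir))
- #     assert ir2.predict([1.5]) == ir.predict([1.5])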