import itertools import functools import operator import warnings from distutils.version import LooseVersion import numpy as np from pandas import compat from pandas._libs import tslib, lib from pandas.core.dtypes.common import ( _get_dtype, is_float, is_scalar, is_integer, is_complex, is_float_dtype, is_complex_dtype, is_integer_dtype, is_bool_dtype, is_object_dtype, is_numeric_dtype, is_datetime64_dtype, is_timedelta64_dtype, is_datetime_or_timedelta_dtype, is_int_or_datetime_dtype, is_any_int_dtype) from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype from pandas.core.config import get_option import pandas.core.common as com _BOTTLENECK_INSTALLED = False _MIN_BOTTLENECK_VERSION = '1.0.0' try: import bottleneck as bn ver = bn.__version__ _BOTTLENECK_INSTALLED = (LooseVersion(ver) >= LooseVersion(_MIN_BOTTLENECK_VERSION)) if not _BOTTLENECK_INSTALLED: warnings.warn( "The installed version of bottleneck {ver} is not supported " "in pandas and will be not be used\nThe minimum supported " "version is {min_ver}\n".format( ver=ver, min_ver=_MIN_BOTTLENECK_VERSION), UserWarning) except ImportError: # pragma: no cover pass _USE_BOTTLENECK = False def set_use_bottleneck(v=True): # set/unset to use bottleneck global _USE_BOTTLENECK if _BOTTLENECK_INSTALLED: _USE_BOTTLENECK = v set_use_bottleneck(get_option('compute.use_bottleneck')) class disallow(object): def __init__(self, *dtypes): super(disallow, self).__init__() self.dtypes = tuple(np.dtype(dtype).type for dtype in dtypes) def check(self, obj): return hasattr(obj, 'dtype') and issubclass(obj.dtype.type, self.dtypes) def __call__(self, f): @functools.wraps(f) def _f(*args, **kwargs): obj_iter = itertools.chain(args, compat.itervalues(kwargs)) if any(self.check(obj) for obj in obj_iter): msg = 'reduction operation {name!r} not allowed for this dtype' raise TypeError(msg.format(name=f.__name__.replace('nan', ''))) try: with np.errstate(invalid='ignore'): return f(*args, **kwargs) except ValueError as e: # we want to transform an object array # ValueError message to the more typical TypeError # e.g. this is normally a disallowed function on # object arrays that contain strings if is_object_dtype(args[0]): raise TypeError(e) raise return _f class bottleneck_switch(object): def __init__(self, **kwargs): self.kwargs = kwargs def __call__(self, alt): bn_name = alt.__name__ try: bn_func = getattr(bn, bn_name) except (AttributeError, NameError): # pragma: no cover bn_func = None @functools.wraps(alt) def f(values, axis=None, skipna=True, **kwds): if len(self.kwargs) > 0: for k, v in compat.iteritems(self.kwargs): if k not in kwds: kwds[k] = v try: if values.size == 0 and kwds.get('min_count') is None: # We are empty, returning NA for our type # Only applies for the default `min_count` of None # since that affects how empty arrays are handled. # TODO(GH-18976) update all the nanops methods to # correctly handle empty inputs and remove this check. # It *may* just be `var` return _na_for_min_count(values, axis) if (_USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name)): result = bn_func(values, axis=axis, **kwds) # prefer to treat inf/-inf as NA, but must compute the func # twice :( if _has_infs(result): result = alt(values, axis=axis, skipna=skipna, **kwds) else: result = alt(values, axis=axis, skipna=skipna, **kwds) except Exception: try: result = alt(values, axis=axis, skipna=skipna, **kwds) except ValueError as e: # we want to transform an object array # ValueError message to the more typical TypeError # e.g. this is normally a disallowed function on # object arrays that contain strings if is_object_dtype(values): raise TypeError(e) raise return result return f def _bn_ok_dtype(dt, name): # Bottleneck chokes on datetime64 if (not is_object_dtype(dt) and not is_datetime_or_timedelta_dtype(dt)): # GH 15507 # bottleneck does not properly upcast during the sum # so can overflow # GH 9422 # further we also want to preserve NaN when all elements # are NaN, unlinke bottleneck/numpy which consider this # to be 0 if name in ['nansum', 'nanprod']: return False return True return False def _has_infs(result): if isinstance(result, np.ndarray): if result.dtype == 'f8': return lib.has_infs_f8(result.ravel()) elif result.dtype == 'f4': return lib.has_infs_f4(result.ravel()) try: return np.isinf(result).any() except (TypeError, NotImplementedError): # if it doesn't support infs, then it can't have infs return False def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): """ return the correct fill value for the dtype of the values """ if fill_value is not None: return fill_value if _na_ok_dtype(dtype): if fill_value_typ is None: return np.nan else: if fill_value_typ == '+inf': return np.inf else: return -np.inf else: if fill_value_typ is None: return tslib.iNaT else: if fill_value_typ == '+inf': # need the max int here return _int64_max else: return tslib.iNaT def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True): """ utility to get the values view, mask, dtype if necessary copy and mask using the specified fill_value copy = True will force the copy """ values = com._values_from_object(values) if isfinite: mask = _isfinite(values) else: mask = isna(values) dtype = values.dtype dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) if skipna: if copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, changed = maybe_upcast_putmask(values, mask, fill_value) elif copy: values = values.copy() values = _view_if_needed(values) # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max def _isfinite(values): if is_datetime_or_timedelta_dtype(values): return isna(values) if (is_complex_dtype(values) or is_float_dtype(values) or is_integer_dtype(values) or is_bool_dtype(values)): return ~np.isfinite(values) return ~np.isfinite(values.astype('float64')) def _na_ok_dtype(dtype): return not is_int_or_datetime_dtype(dtype) def _view_if_needed(values): if is_datetime_or_timedelta_dtype(values): return values.view(np.int64) return values def _wrap_results(result, dtype): """ wrap our results if needed """ if is_datetime64_dtype(dtype): if not isinstance(result, np.ndarray): result = tslib.Timestamp(result) else: result = result.view(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): # raise if we have a timedelta64[ns] which is too large if np.fabs(result) > _int64_max: raise ValueError("overflow in timedelta operation") result = tslib.Timedelta(result, unit='ns') else: result = result.astype('i8').view(dtype) return result def _na_for_min_count(values, axis): """Return the missing value for `values` Parameters ---------- values : ndarray axis : int or None axis for the reduction Returns ------- result : scalar or ndarray For 1-D values, returns a scalar of the correct missing type. For 2-D values, returns a 1-D array where each element is missing. """ # we either return np.nan or pd.NaT if is_numeric_dtype(values): values = values.astype('float64') fill_value = na_value_for_dtype(values.dtype) if values.ndim == 1: return fill_value else: result_shape = (values.shape[:axis] + values.shape[axis + 1:]) result = np.empty(result_shape, dtype=values.dtype) result.fill(fill_value) return result def nanany(values, axis=None, skipna=True): values, mask, dtype, _ = _get_values(values, skipna, False, copy=skipna) return values.any(axis) def nanall(values, axis=None, skipna=True): values, mask, dtype, _ = _get_values(values, skipna, True, copy=skipna) return values.all(axis) @disallow('M8') def nansum(values, axis=None, skipna=True, min_count=0): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) dtype_sum = dtype_max if is_float_dtype(dtype): dtype_sum = dtype elif is_timedelta64_dtype(dtype): dtype_sum = np.float64 the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count) return _wrap_results(the_sum, dtype) @disallow('M8') @bottleneck_switch() def nanmean(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) dtype_sum = dtype_max dtype_count = np.float64 if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype dtype_count = dtype count = _get_counts(mask, axis, dtype=dtype_count) the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) if axis is not None and getattr(the_sum, 'ndim', False): the_mean = the_sum / count ct_mask = count == 0 if ct_mask.any(): the_mean[ct_mask] = np.nan else: the_mean = the_sum / count if count > 0 else np.nan return _wrap_results(the_mean, dtype) @disallow('M8') @bottleneck_switch() def nanmedian(values, axis=None, skipna=True): def get_median(x): mask = notna(x) if not skipna and not mask.all(): return np.nan return np.nanmedian(x[mask]) values, mask, dtype, dtype_max = _get_values(values, skipna) if not is_float_dtype(values): values = values.astype('f8') values[mask] = np.nan if axis is None: values = values.ravel() notempty = values.size # an array from a frame if values.ndim > 1: # there's a non-empty array to apply over otherwise numpy raises if notempty: if not skipna: return _wrap_results( np.apply_along_axis(get_median, axis, values), dtype) # fastpath for the skipna case return _wrap_results(np.nanmedian(values, axis), dtype) # must return the correct shape, but median is not defined for the # empty set so return nans of shape "everything but the passed axis" # since "axis" is where the reduction would occur if we had a nonempty # array shp = np.array(values.shape) dims = np.arange(values.ndim) ret = np.empty(shp[dims != axis]) ret.fill(np.nan) return _wrap_results(ret, dtype) # otherwise return a scalar value return _wrap_results(get_median(values) if notempty else np.nan, dtype) def _get_counts_nanvar(mask, axis, ddof, dtype=float): dtype = _get_dtype(dtype) count = _get_counts(mask, axis, dtype=dtype) d = count - dtype.type(ddof) # always return NaN, never inf if is_scalar(count): if count <= ddof: count = np.nan d = np.nan else: mask2 = count <= ddof if mask2.any(): np.putmask(d, mask2, np.nan) np.putmask(count, mask2, np.nan) return count, d @disallow('M8') @bottleneck_switch(ddof=1) def nanstd(values, axis=None, skipna=True, ddof=1): result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof)) return _wrap_results(result, values.dtype) @disallow('M8') @bottleneck_switch(ddof=1) def nanvar(values, axis=None, skipna=True, ddof=1): values = com._values_from_object(values) dtype = values.dtype mask = isna(values) if is_any_int_dtype(values): values = values.astype('f8') values[mask] = np.nan if is_float_dtype(values): count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype) else: count, d = _get_counts_nanvar(mask, axis, ddof) if skipna: values = values.copy() np.putmask(values, mask, 0) # xref GH10242 # Compute variance via two-pass algorithm, which is stable against # cancellation errors and relatively accurate for small numbers of # observations. # # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count if axis is not None: avg = np.expand_dims(avg, axis) sqr = _ensure_numeric((avg - values)**2) np.putmask(sqr, mask, 0) result = sqr.sum(axis=axis, dtype=np.float64) / d # Return variance as np.float64 (the datatype used in the accumulator), # unless we were dealing with a float array, in which case use the same # precision as the original values array. if is_float_dtype(dtype): result = result.astype(dtype) return _wrap_results(result, values.dtype) @disallow('M8', 'm8') def nansem(values, axis=None, skipna=True, ddof=1): var = nanvar(values, axis, skipna, ddof=ddof) mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype) var = nanvar(values, axis, skipna, ddof=ddof) return np.sqrt(var) / np.sqrt(count) def _nanminmax(meth, fill_value_typ): @bottleneck_switch() def reduction(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values( values, skipna, fill_value_typ=fill_value_typ, ) if ((axis is not None and values.shape[axis] == 0) or values.size == 0): try: result = getattr(values, meth)(axis, dtype=dtype_max) result.fill(np.nan) except: result = np.nan else: result = getattr(values, meth)(axis) result = _wrap_results(result, dtype) return _maybe_null_out(result, axis, mask) reduction.__name__ = 'nan' + meth return reduction nanmin = _nanminmax('min', fill_value_typ='+inf') nanmax = _nanminmax('max', fill_value_typ='-inf') @disallow('O') def nanargmax(values, axis=None, skipna=True): """ Returns -1 in the NA case """ values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf') result = values.argmax(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result @disallow('O') def nanargmin(values, axis=None, skipna=True): """ Returns -1 in the NA case """ values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf') result = values.argmin(axis) result = _maybe_arg_null_out(result, axis, mask, skipna) return result @disallow('M8', 'm8') def nanskew(values, axis=None, skipna=True): """ Compute the sample skewness. The statistic computed here is the adjusted Fisher-Pearson standardized moment coefficient G1. The algorithm computes this coefficient directly from the second and third central moment. """ values = com._values_from_object(values) mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) else: count = _get_counts(mask, axis, dtype=values.dtype) if skipna: values = values.copy() np.putmask(values, mask, 0) mean = values.sum(axis, dtype=np.float64) / count if axis is not None: mean = np.expand_dims(mean, axis) adjusted = values - mean if skipna: np.putmask(adjusted, mask, 0) adjusted2 = adjusted ** 2 adjusted3 = adjusted2 * adjusted m2 = adjusted2.sum(axis, dtype=np.float64) m3 = adjusted3.sum(axis, dtype=np.float64) # floating point error # # #18044 in _libs/windows.pyx calc_skew follow this behavior # to fix the fperr to treat m2 <1e-14 as zero m2 = _zero_out_fperr(m2) m3 = _zero_out_fperr(m3) with np.errstate(invalid='ignore', divide='ignore'): result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5) dtype = values.dtype if is_float_dtype(dtype): result = result.astype(dtype) if isinstance(result, np.ndarray): result = np.where(m2 == 0, 0, result) result[count < 3] = np.nan return result else: result = 0 if m2 == 0 else result if count < 3: return np.nan return result @disallow('M8', 'm8') def nankurt(values, axis=None, skipna=True): """ Compute the sample excess kurtosis. The statistic computed here is the adjusted Fisher-Pearson standardized moment coefficient G2, computed directly from the second and fourth central moment. """ values = com._values_from_object(values) mask = isna(values) if not is_float_dtype(values.dtype): values = values.astype('f8') count = _get_counts(mask, axis) else: count = _get_counts(mask, axis, dtype=values.dtype) if skipna: values = values.copy() np.putmask(values, mask, 0) mean = values.sum(axis, dtype=np.float64) / count if axis is not None: mean = np.expand_dims(mean, axis) adjusted = values - mean if skipna: np.putmask(adjusted, mask, 0) adjusted2 = adjusted ** 2 adjusted4 = adjusted2 ** 2 m2 = adjusted2.sum(axis, dtype=np.float64) m4 = adjusted4.sum(axis, dtype=np.float64) with np.errstate(invalid='ignore', divide='ignore'): adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3)) numer = count * (count + 1) * (count - 1) * m4 denom = (count - 2) * (count - 3) * m2**2 result = numer / denom - adj # floating point error # # #18044 in _libs/windows.pyx calc_kurt follow this behavior # to fix the fperr to treat denom <1e-14 as zero numer = _zero_out_fperr(numer) denom = _zero_out_fperr(denom) if not isinstance(denom, np.ndarray): # if ``denom`` is a scalar, check these corner cases first before # doing division if count < 4: return np.nan if denom == 0: return 0 with np.errstate(invalid='ignore', divide='ignore'): result = numer / denom - adj dtype = values.dtype if is_float_dtype(dtype): result = result.astype(dtype) if isinstance(result, np.ndarray): result = np.where(denom == 0, 0, result) result[count < 4] = np.nan return result @disallow('M8', 'm8') def nanprod(values, axis=None, skipna=True, min_count=0): mask = isna(values) if skipna and not is_any_int_dtype(values): values = values.copy() values[mask] = 1 result = values.prod(axis) return _maybe_null_out(result, axis, mask, min_count=min_count) def _maybe_arg_null_out(result, axis, mask, skipna): # helper function for nanargmin/nanargmax if axis is None or not getattr(result, 'ndim', False): if skipna: if mask.all(): result = -1 else: if mask.any(): result = -1 else: if skipna: na_mask = mask.all(axis) else: na_mask = mask.any(axis) if na_mask.any(): result[na_mask] = -1 return result def _get_counts(mask, axis, dtype=float): dtype = _get_dtype(dtype) if axis is None: return dtype.type(mask.size - mask.sum()) count = mask.shape[axis] - mask.sum(axis) if is_scalar(count): return dtype.type(count) try: return count.astype(dtype) except AttributeError: return np.array(count, dtype=dtype) def _maybe_null_out(result, axis, mask, min_count=1): if axis is not None and getattr(result, 'ndim', False): null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0 if np.any(null_mask): if is_numeric_dtype(result): if np.iscomplexobj(result): result = result.astype('c16') else: result = result.astype('f8') result[null_mask] = np.nan else: # GH12941, use None to auto cast null result[null_mask] = None elif result is not tslib.NaT: null_mask = mask.size - mask.sum() if null_mask < min_count: result = np.nan return result def _zero_out_fperr(arg): # #18044 reference this behavior to fix rolling skew/kurt issue if isinstance(arg, np.ndarray): with np.errstate(invalid='ignore'): return np.where(np.abs(arg) < 1e-14, 0, arg) else: return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg @disallow('M8', 'm8') def nancorr(a, b, method='pearson', min_periods=None): """ a, b: ndarrays """ if len(a) != len(b): raise AssertionError('Operands to nancorr must have same size') if min_periods is None: min_periods = 1 valid = notna(a) & notna(b) if not valid.all(): a = a[valid] b = b[valid] if len(a) < min_periods: return np.nan f = get_corr_func(method) return f(a, b) def get_corr_func(method): if method in ['kendall', 'spearman']: from scipy.stats import kendalltau, spearmanr def _pearson(a, b): return np.corrcoef(a, b)[0, 1] def _kendall(a, b): rs = kendalltau(a, b) if isinstance(rs, tuple): return rs[0] return rs def _spearman(a, b): return spearmanr(a, b)[0] _cor_methods = { 'pearson': _pearson, 'kendall': _kendall, 'spearman': _spearman } return _cor_methods[method] @disallow('M8', 'm8') def nancov(a, b, min_periods=None): if len(a) != len(b): raise AssertionError('Operands to nancov must have same size') if min_periods is None: min_periods = 1 valid = notna(a) & notna(b) if not valid.all(): a = a[valid] b = b[valid] if len(a) < min_periods: return np.nan return np.cov(a, b)[0, 1] def _ensure_numeric(x): if isinstance(x, np.ndarray): if is_integer_dtype(x) or is_bool_dtype(x): x = x.astype(np.float64) elif is_object_dtype(x): try: x = x.astype(np.complex128) except: x = x.astype(np.float64) else: if not np.any(x.imag): x = x.real elif not (is_float(x) or is_integer(x) or is_complex(x)): try: x = float(x) except Exception: try: x = complex(x) except Exception: raise TypeError('Could not convert {value!s} to numeric' .format(value=x)) return x # NA-friendly array comparisons def make_nancomp(op): def f(x, y): xmask = isna(x) ymask = isna(y) mask = xmask | ymask with np.errstate(all='ignore'): result = op(x, y) if mask.any(): if is_bool_dtype(result): result = result.astype('O') np.putmask(result, mask, np.nan) return result return f nangt = make_nancomp(operator.gt) nange = make_nancomp(operator.ge) nanlt = make_nancomp(operator.lt) nanle = make_nancomp(operator.le) naneq = make_nancomp(operator.eq) nanne = make_nancomp(operator.ne)