laywerrobot/lib/python3.6/site-packages/pandas/core/nanops.py
2020-08-27 21:55:39 +02:00

858 lines
25 KiB
Python

import itertools
import functools
import operator
import warnings
from distutils.version import LooseVersion
import numpy as np
from pandas import compat
from pandas._libs import tslib, lib
from pandas.core.dtypes.common import (
_get_dtype,
is_float, is_scalar,
is_integer, is_complex, is_float_dtype,
is_complex_dtype, is_integer_dtype,
is_bool_dtype, is_object_dtype,
is_numeric_dtype,
is_datetime64_dtype, is_timedelta64_dtype,
is_datetime_or_timedelta_dtype,
is_int_or_datetime_dtype, is_any_int_dtype)
from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask
from pandas.core.dtypes.missing import isna, notna, na_value_for_dtype
from pandas.core.config import get_option
import pandas.core.common as com
_BOTTLENECK_INSTALLED = False
_MIN_BOTTLENECK_VERSION = '1.0.0'
try:
import bottleneck as bn
ver = bn.__version__
_BOTTLENECK_INSTALLED = (LooseVersion(ver) >=
LooseVersion(_MIN_BOTTLENECK_VERSION))
if not _BOTTLENECK_INSTALLED:
warnings.warn(
"The installed version of bottleneck {ver} is not supported "
"in pandas and will be not be used\nThe minimum supported "
"version is {min_ver}\n".format(
ver=ver, min_ver=_MIN_BOTTLENECK_VERSION), UserWarning)
except ImportError: # pragma: no cover
pass
_USE_BOTTLENECK = False
def set_use_bottleneck(v=True):
# set/unset to use bottleneck
global _USE_BOTTLENECK
if _BOTTLENECK_INSTALLED:
_USE_BOTTLENECK = v
set_use_bottleneck(get_option('compute.use_bottleneck'))
class disallow(object):
def __init__(self, *dtypes):
super(disallow, self).__init__()
self.dtypes = tuple(np.dtype(dtype).type for dtype in dtypes)
def check(self, obj):
return hasattr(obj, 'dtype') and issubclass(obj.dtype.type,
self.dtypes)
def __call__(self, f):
@functools.wraps(f)
def _f(*args, **kwargs):
obj_iter = itertools.chain(args, compat.itervalues(kwargs))
if any(self.check(obj) for obj in obj_iter):
msg = 'reduction operation {name!r} not allowed for this dtype'
raise TypeError(msg.format(name=f.__name__.replace('nan', '')))
try:
with np.errstate(invalid='ignore'):
return f(*args, **kwargs)
except ValueError as e:
# we want to transform an object array
# ValueError message to the more typical TypeError
# e.g. this is normally a disallowed function on
# object arrays that contain strings
if is_object_dtype(args[0]):
raise TypeError(e)
raise
return _f
class bottleneck_switch(object):
def __init__(self, **kwargs):
self.kwargs = kwargs
def __call__(self, alt):
bn_name = alt.__name__
try:
bn_func = getattr(bn, bn_name)
except (AttributeError, NameError): # pragma: no cover
bn_func = None
@functools.wraps(alt)
def f(values, axis=None, skipna=True, **kwds):
if len(self.kwargs) > 0:
for k, v in compat.iteritems(self.kwargs):
if k not in kwds:
kwds[k] = v
try:
if values.size == 0 and kwds.get('min_count') is None:
# We are empty, returning NA for our type
# Only applies for the default `min_count` of None
# since that affects how empty arrays are handled.
# TODO(GH-18976) update all the nanops methods to
# correctly handle empty inputs and remove this check.
# It *may* just be `var`
return _na_for_min_count(values, axis)
if (_USE_BOTTLENECK and skipna and
_bn_ok_dtype(values.dtype, bn_name)):
result = bn_func(values, axis=axis, **kwds)
# prefer to treat inf/-inf as NA, but must compute the func
# twice :(
if _has_infs(result):
result = alt(values, axis=axis, skipna=skipna, **kwds)
else:
result = alt(values, axis=axis, skipna=skipna, **kwds)
except Exception:
try:
result = alt(values, axis=axis, skipna=skipna, **kwds)
except ValueError as e:
# we want to transform an object array
# ValueError message to the more typical TypeError
# e.g. this is normally a disallowed function on
# object arrays that contain strings
if is_object_dtype(values):
raise TypeError(e)
raise
return result
return f
def _bn_ok_dtype(dt, name):
# Bottleneck chokes on datetime64
if (not is_object_dtype(dt) and not is_datetime_or_timedelta_dtype(dt)):
# GH 15507
# bottleneck does not properly upcast during the sum
# so can overflow
# GH 9422
# further we also want to preserve NaN when all elements
# are NaN, unlinke bottleneck/numpy which consider this
# to be 0
if name in ['nansum', 'nanprod']:
return False
return True
return False
def _has_infs(result):
if isinstance(result, np.ndarray):
if result.dtype == 'f8':
return lib.has_infs_f8(result.ravel())
elif result.dtype == 'f4':
return lib.has_infs_f4(result.ravel())
try:
return np.isinf(result).any()
except (TypeError, NotImplementedError):
# if it doesn't support infs, then it can't have infs
return False
def _get_fill_value(dtype, fill_value=None, fill_value_typ=None):
""" return the correct fill value for the dtype of the values """
if fill_value is not None:
return fill_value
if _na_ok_dtype(dtype):
if fill_value_typ is None:
return np.nan
else:
if fill_value_typ == '+inf':
return np.inf
else:
return -np.inf
else:
if fill_value_typ is None:
return tslib.iNaT
else:
if fill_value_typ == '+inf':
# need the max int here
return _int64_max
else:
return tslib.iNaT
def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
isfinite=False, copy=True):
""" utility to get the values view, mask, dtype
if necessary copy and mask using the specified fill_value
copy = True will force the copy
"""
values = com._values_from_object(values)
if isfinite:
mask = _isfinite(values)
else:
mask = isna(values)
dtype = values.dtype
dtype_ok = _na_ok_dtype(dtype)
# get our fill value (in case we need to provide an alternative
# dtype for it)
fill_value = _get_fill_value(dtype, fill_value=fill_value,
fill_value_typ=fill_value_typ)
if skipna:
if copy:
values = values.copy()
if dtype_ok:
np.putmask(values, mask, fill_value)
# promote if needed
else:
values, changed = maybe_upcast_putmask(values, mask, fill_value)
elif copy:
values = values.copy()
values = _view_if_needed(values)
# return a platform independent precision dtype
dtype_max = dtype
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
dtype_max = np.int64
elif is_float_dtype(dtype):
dtype_max = np.float64
return values, mask, dtype, dtype_max
def _isfinite(values):
if is_datetime_or_timedelta_dtype(values):
return isna(values)
if (is_complex_dtype(values) or is_float_dtype(values) or
is_integer_dtype(values) or is_bool_dtype(values)):
return ~np.isfinite(values)
return ~np.isfinite(values.astype('float64'))
def _na_ok_dtype(dtype):
return not is_int_or_datetime_dtype(dtype)
def _view_if_needed(values):
if is_datetime_or_timedelta_dtype(values):
return values.view(np.int64)
return values
def _wrap_results(result, dtype):
""" wrap our results if needed """
if is_datetime64_dtype(dtype):
if not isinstance(result, np.ndarray):
result = tslib.Timestamp(result)
else:
result = result.view(dtype)
elif is_timedelta64_dtype(dtype):
if not isinstance(result, np.ndarray):
# raise if we have a timedelta64[ns] which is too large
if np.fabs(result) > _int64_max:
raise ValueError("overflow in timedelta operation")
result = tslib.Timedelta(result, unit='ns')
else:
result = result.astype('i8').view(dtype)
return result
def _na_for_min_count(values, axis):
"""Return the missing value for `values`
Parameters
----------
values : ndarray
axis : int or None
axis for the reduction
Returns
-------
result : scalar or ndarray
For 1-D values, returns a scalar of the correct missing type.
For 2-D values, returns a 1-D array where each element is missing.
"""
# we either return np.nan or pd.NaT
if is_numeric_dtype(values):
values = values.astype('float64')
fill_value = na_value_for_dtype(values.dtype)
if values.ndim == 1:
return fill_value
else:
result_shape = (values.shape[:axis] +
values.shape[axis + 1:])
result = np.empty(result_shape, dtype=values.dtype)
result.fill(fill_value)
return result
def nanany(values, axis=None, skipna=True):
values, mask, dtype, _ = _get_values(values, skipna, False, copy=skipna)
return values.any(axis)
def nanall(values, axis=None, skipna=True):
values, mask, dtype, _ = _get_values(values, skipna, True, copy=skipna)
return values.all(axis)
@disallow('M8')
def nansum(values, axis=None, skipna=True, min_count=0):
values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
dtype_sum = dtype_max
if is_float_dtype(dtype):
dtype_sum = dtype
elif is_timedelta64_dtype(dtype):
dtype_sum = np.float64
the_sum = values.sum(axis, dtype=dtype_sum)
the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count)
return _wrap_results(the_sum, dtype)
@disallow('M8')
@bottleneck_switch()
def nanmean(values, axis=None, skipna=True):
values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
dtype_sum = dtype_max
dtype_count = np.float64
if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype):
dtype_sum = np.float64
elif is_float_dtype(dtype):
dtype_sum = dtype
dtype_count = dtype
count = _get_counts(mask, axis, dtype=dtype_count)
the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
if axis is not None and getattr(the_sum, 'ndim', False):
the_mean = the_sum / count
ct_mask = count == 0
if ct_mask.any():
the_mean[ct_mask] = np.nan
else:
the_mean = the_sum / count if count > 0 else np.nan
return _wrap_results(the_mean, dtype)
@disallow('M8')
@bottleneck_switch()
def nanmedian(values, axis=None, skipna=True):
def get_median(x):
mask = notna(x)
if not skipna and not mask.all():
return np.nan
return np.nanmedian(x[mask])
values, mask, dtype, dtype_max = _get_values(values, skipna)
if not is_float_dtype(values):
values = values.astype('f8')
values[mask] = np.nan
if axis is None:
values = values.ravel()
notempty = values.size
# an array from a frame
if values.ndim > 1:
# there's a non-empty array to apply over otherwise numpy raises
if notempty:
if not skipna:
return _wrap_results(
np.apply_along_axis(get_median, axis, values), dtype)
# fastpath for the skipna case
return _wrap_results(np.nanmedian(values, axis), dtype)
# must return the correct shape, but median is not defined for the
# empty set so return nans of shape "everything but the passed axis"
# since "axis" is where the reduction would occur if we had a nonempty
# array
shp = np.array(values.shape)
dims = np.arange(values.ndim)
ret = np.empty(shp[dims != axis])
ret.fill(np.nan)
return _wrap_results(ret, dtype)
# otherwise return a scalar value
return _wrap_results(get_median(values) if notempty else np.nan, dtype)
def _get_counts_nanvar(mask, axis, ddof, dtype=float):
dtype = _get_dtype(dtype)
count = _get_counts(mask, axis, dtype=dtype)
d = count - dtype.type(ddof)
# always return NaN, never inf
if is_scalar(count):
if count <= ddof:
count = np.nan
d = np.nan
else:
mask2 = count <= ddof
if mask2.any():
np.putmask(d, mask2, np.nan)
np.putmask(count, mask2, np.nan)
return count, d
@disallow('M8')
@bottleneck_switch(ddof=1)
def nanstd(values, axis=None, skipna=True, ddof=1):
result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof))
return _wrap_results(result, values.dtype)
@disallow('M8')
@bottleneck_switch(ddof=1)
def nanvar(values, axis=None, skipna=True, ddof=1):
values = com._values_from_object(values)
dtype = values.dtype
mask = isna(values)
if is_any_int_dtype(values):
values = values.astype('f8')
values[mask] = np.nan
if is_float_dtype(values):
count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype)
else:
count, d = _get_counts_nanvar(mask, axis, ddof)
if skipna:
values = values.copy()
np.putmask(values, mask, 0)
# xref GH10242
# Compute variance via two-pass algorithm, which is stable against
# cancellation errors and relatively accurate for small numbers of
# observations.
#
# See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
if axis is not None:
avg = np.expand_dims(avg, axis)
sqr = _ensure_numeric((avg - values)**2)
np.putmask(sqr, mask, 0)
result = sqr.sum(axis=axis, dtype=np.float64) / d
# Return variance as np.float64 (the datatype used in the accumulator),
# unless we were dealing with a float array, in which case use the same
# precision as the original values array.
if is_float_dtype(dtype):
result = result.astype(dtype)
return _wrap_results(result, values.dtype)
@disallow('M8', 'm8')
def nansem(values, axis=None, skipna=True, ddof=1):
var = nanvar(values, axis, skipna, ddof=ddof)
mask = isna(values)
if not is_float_dtype(values.dtype):
values = values.astype('f8')
count, _ = _get_counts_nanvar(mask, axis, ddof, values.dtype)
var = nanvar(values, axis, skipna, ddof=ddof)
return np.sqrt(var) / np.sqrt(count)
def _nanminmax(meth, fill_value_typ):
@bottleneck_switch()
def reduction(values, axis=None, skipna=True):
values, mask, dtype, dtype_max = _get_values(
values, skipna, fill_value_typ=fill_value_typ, )
if ((axis is not None and values.shape[axis] == 0) or
values.size == 0):
try:
result = getattr(values, meth)(axis, dtype=dtype_max)
result.fill(np.nan)
except:
result = np.nan
else:
result = getattr(values, meth)(axis)
result = _wrap_results(result, dtype)
return _maybe_null_out(result, axis, mask)
reduction.__name__ = 'nan' + meth
return reduction
nanmin = _nanminmax('min', fill_value_typ='+inf')
nanmax = _nanminmax('max', fill_value_typ='-inf')
@disallow('O')
def nanargmax(values, axis=None, skipna=True):
"""
Returns -1 in the NA case
"""
values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='-inf')
result = values.argmax(axis)
result = _maybe_arg_null_out(result, axis, mask, skipna)
return result
@disallow('O')
def nanargmin(values, axis=None, skipna=True):
"""
Returns -1 in the NA case
"""
values, mask, dtype, _ = _get_values(values, skipna, fill_value_typ='+inf')
result = values.argmin(axis)
result = _maybe_arg_null_out(result, axis, mask, skipna)
return result
@disallow('M8', 'm8')
def nanskew(values, axis=None, skipna=True):
""" Compute the sample skewness.
The statistic computed here is the adjusted Fisher-Pearson standardized
moment coefficient G1. The algorithm computes this coefficient directly
from the second and third central moment.
"""
values = com._values_from_object(values)
mask = isna(values)
if not is_float_dtype(values.dtype):
values = values.astype('f8')
count = _get_counts(mask, axis)
else:
count = _get_counts(mask, axis, dtype=values.dtype)
if skipna:
values = values.copy()
np.putmask(values, mask, 0)
mean = values.sum(axis, dtype=np.float64) / count
if axis is not None:
mean = np.expand_dims(mean, axis)
adjusted = values - mean
if skipna:
np.putmask(adjusted, mask, 0)
adjusted2 = adjusted ** 2
adjusted3 = adjusted2 * adjusted
m2 = adjusted2.sum(axis, dtype=np.float64)
m3 = adjusted3.sum(axis, dtype=np.float64)
# floating point error
#
# #18044 in _libs/windows.pyx calc_skew follow this behavior
# to fix the fperr to treat m2 <1e-14 as zero
m2 = _zero_out_fperr(m2)
m3 = _zero_out_fperr(m3)
with np.errstate(invalid='ignore', divide='ignore'):
result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)
dtype = values.dtype
if is_float_dtype(dtype):
result = result.astype(dtype)
if isinstance(result, np.ndarray):
result = np.where(m2 == 0, 0, result)
result[count < 3] = np.nan
return result
else:
result = 0 if m2 == 0 else result
if count < 3:
return np.nan
return result
@disallow('M8', 'm8')
def nankurt(values, axis=None, skipna=True):
""" Compute the sample excess kurtosis.
The statistic computed here is the adjusted Fisher-Pearson standardized
moment coefficient G2, computed directly from the second and fourth
central moment.
"""
values = com._values_from_object(values)
mask = isna(values)
if not is_float_dtype(values.dtype):
values = values.astype('f8')
count = _get_counts(mask, axis)
else:
count = _get_counts(mask, axis, dtype=values.dtype)
if skipna:
values = values.copy()
np.putmask(values, mask, 0)
mean = values.sum(axis, dtype=np.float64) / count
if axis is not None:
mean = np.expand_dims(mean, axis)
adjusted = values - mean
if skipna:
np.putmask(adjusted, mask, 0)
adjusted2 = adjusted ** 2
adjusted4 = adjusted2 ** 2
m2 = adjusted2.sum(axis, dtype=np.float64)
m4 = adjusted4.sum(axis, dtype=np.float64)
with np.errstate(invalid='ignore', divide='ignore'):
adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
numer = count * (count + 1) * (count - 1) * m4
denom = (count - 2) * (count - 3) * m2**2
result = numer / denom - adj
# floating point error
#
# #18044 in _libs/windows.pyx calc_kurt follow this behavior
# to fix the fperr to treat denom <1e-14 as zero
numer = _zero_out_fperr(numer)
denom = _zero_out_fperr(denom)
if not isinstance(denom, np.ndarray):
# if ``denom`` is a scalar, check these corner cases first before
# doing division
if count < 4:
return np.nan
if denom == 0:
return 0
with np.errstate(invalid='ignore', divide='ignore'):
result = numer / denom - adj
dtype = values.dtype
if is_float_dtype(dtype):
result = result.astype(dtype)
if isinstance(result, np.ndarray):
result = np.where(denom == 0, 0, result)
result[count < 4] = np.nan
return result
@disallow('M8', 'm8')
def nanprod(values, axis=None, skipna=True, min_count=0):
mask = isna(values)
if skipna and not is_any_int_dtype(values):
values = values.copy()
values[mask] = 1
result = values.prod(axis)
return _maybe_null_out(result, axis, mask, min_count=min_count)
def _maybe_arg_null_out(result, axis, mask, skipna):
# helper function for nanargmin/nanargmax
if axis is None or not getattr(result, 'ndim', False):
if skipna:
if mask.all():
result = -1
else:
if mask.any():
result = -1
else:
if skipna:
na_mask = mask.all(axis)
else:
na_mask = mask.any(axis)
if na_mask.any():
result[na_mask] = -1
return result
def _get_counts(mask, axis, dtype=float):
dtype = _get_dtype(dtype)
if axis is None:
return dtype.type(mask.size - mask.sum())
count = mask.shape[axis] - mask.sum(axis)
if is_scalar(count):
return dtype.type(count)
try:
return count.astype(dtype)
except AttributeError:
return np.array(count, dtype=dtype)
def _maybe_null_out(result, axis, mask, min_count=1):
if axis is not None and getattr(result, 'ndim', False):
null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
if np.any(null_mask):
if is_numeric_dtype(result):
if np.iscomplexobj(result):
result = result.astype('c16')
else:
result = result.astype('f8')
result[null_mask] = np.nan
else:
# GH12941, use None to auto cast null
result[null_mask] = None
elif result is not tslib.NaT:
null_mask = mask.size - mask.sum()
if null_mask < min_count:
result = np.nan
return result
def _zero_out_fperr(arg):
# #18044 reference this behavior to fix rolling skew/kurt issue
if isinstance(arg, np.ndarray):
with np.errstate(invalid='ignore'):
return np.where(np.abs(arg) < 1e-14, 0, arg)
else:
return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
@disallow('M8', 'm8')
def nancorr(a, b, method='pearson', min_periods=None):
"""
a, b: ndarrays
"""
if len(a) != len(b):
raise AssertionError('Operands to nancorr must have same size')
if min_periods is None:
min_periods = 1
valid = notna(a) & notna(b)
if not valid.all():
a = a[valid]
b = b[valid]
if len(a) < min_periods:
return np.nan
f = get_corr_func(method)
return f(a, b)
def get_corr_func(method):
if method in ['kendall', 'spearman']:
from scipy.stats import kendalltau, spearmanr
def _pearson(a, b):
return np.corrcoef(a, b)[0, 1]
def _kendall(a, b):
rs = kendalltau(a, b)
if isinstance(rs, tuple):
return rs[0]
return rs
def _spearman(a, b):
return spearmanr(a, b)[0]
_cor_methods = {
'pearson': _pearson,
'kendall': _kendall,
'spearman': _spearman
}
return _cor_methods[method]
@disallow('M8', 'm8')
def nancov(a, b, min_periods=None):
if len(a) != len(b):
raise AssertionError('Operands to nancov must have same size')
if min_periods is None:
min_periods = 1
valid = notna(a) & notna(b)
if not valid.all():
a = a[valid]
b = b[valid]
if len(a) < min_periods:
return np.nan
return np.cov(a, b)[0, 1]
def _ensure_numeric(x):
if isinstance(x, np.ndarray):
if is_integer_dtype(x) or is_bool_dtype(x):
x = x.astype(np.float64)
elif is_object_dtype(x):
try:
x = x.astype(np.complex128)
except:
x = x.astype(np.float64)
else:
if not np.any(x.imag):
x = x.real
elif not (is_float(x) or is_integer(x) or is_complex(x)):
try:
x = float(x)
except Exception:
try:
x = complex(x)
except Exception:
raise TypeError('Could not convert {value!s} to numeric'
.format(value=x))
return x
# NA-friendly array comparisons
def make_nancomp(op):
def f(x, y):
xmask = isna(x)
ymask = isna(y)
mask = xmask | ymask
with np.errstate(all='ignore'):
result = op(x, y)
if mask.any():
if is_bool_dtype(result):
result = result.astype('O')
np.putmask(result, mask, np.nan)
return result
return f
nangt = make_nancomp(operator.gt)
nange = make_nancomp(operator.ge)
nanlt = make_nancomp(operator.lt)
nanle = make_nancomp(operator.le)
naneq = make_nancomp(operator.eq)
nanne = make_nancomp(operator.ne)