# laywerrobot/lib/python3.6/site-packages/pandas/tests/test_window.py
from itertools import product
import pytest
import warnings
from warnings import catch_warnings
from datetime import datetime, timedelta
from numpy.random import randn
import numpy as np
from pandas import _np_version_under1p12
import pandas as pd
from pandas import (Series, DataFrame, bdate_range,
isna, notna, concat, Timestamp, Index)
import pandas.core.window as rwindow
import pandas.tseries.offsets as offsets
from pandas.core.base import SpecificationError
from pandas.errors import UnsupportedFunctionCall
from pandas.core.sorting import safe_sort
import pandas.util.testing as tm
import pandas.util._test_decorators as td
from pandas.compat import range, zip
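# shared dimensions for the randomly generated fixture data: N rows per
# Series/DataFrame, K columns per DataFrame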
N, K = 100, 10
def assert_equal(left, right):
if isinstance(left, Series):
tm.assert_series_equal(left, right)
else:
tm.assert_frame_equal(left, right)
@pytest.fixture(params=[True, False])
def raw(request):
return request.param
@pytest.fixture(params=['triang', 'blackman', 'hamming', 'bartlett', 'bohman',
'blackmanharris', 'nuttall', 'barthann'])
def win_types(request):
return request.param
@pytest.fixture(params=['kaiser', 'gaussian', 'general_gaussian'])
def win_types_special(request):
return request.param
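# these scipy window types take required extra parameters (kaiser: beta,
# gaussian: std, general_gaussian: power and width), so they are exercised
# separately from the parameter-free win_types above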
class Base(object):
_nan_locs = np.arange(20, 40)
_inf_locs = np.array([])
def _create_data(self):
arr = randn(N)
arr[self._nan_locs] = np.NaN
self.arr = arr
self.rng = bdate_range(datetime(2009, 1, 1), periods=N)
self.series = Series(arr.copy(), index=self.rng)
self.frame = DataFrame(randn(N, K), index=self.rng,
columns=np.arange(K))
class TestApi(Base):
def setup_method(self, method):
self._create_data()
def test_getitem(self):
r = self.frame.rolling(window=5)
tm.assert_index_equal(r._selected_obj.columns, self.frame.columns)
r = self.frame.rolling(window=5)[1]
assert r._selected_obj.name == self.frame.columns[1]
# technically this is allowed
r = self.frame.rolling(window=5)[1, 3]
tm.assert_index_equal(r._selected_obj.columns,
self.frame.columns[[1, 3]])
r = self.frame.rolling(window=5)[[1, 3]]
tm.assert_index_equal(r._selected_obj.columns,
self.frame.columns[[1, 3]])
def test_select_bad_cols(self):
df = DataFrame([[1, 2]], columns=['A', 'B'])
g = df.rolling(window=5)
pytest.raises(KeyError, g.__getitem__, ['C']) # g[['C']]
pytest.raises(KeyError, g.__getitem__, ['A', 'C']) # g[['A', 'C']]
with tm.assert_raises_regex(KeyError, '^[^A]+$'):
# A should not be referenced as a bad column...
# will have to rethink regex if you change message!
g[['A', 'C']]
def test_attribute_access(self):
df = DataFrame([[1, 2]], columns=['A', 'B'])
r = df.rolling(window=5)
tm.assert_series_equal(r.A.sum(), r['A'].sum())
pytest.raises(AttributeError, lambda: r.F)
def tests_skip_nuisance(self):
df = DataFrame({'A': range(5), 'B': range(5, 10), 'C': 'foo'})
r = df.rolling(window=3)
result = r[['A', 'B']].sum()
expected = DataFrame({'A': [np.nan, np.nan, 3, 6, 9],
'B': [np.nan, np.nan, 18, 21, 24]},
columns=list('AB'))
tm.assert_frame_equal(result, expected)
def test_skip_sum_object_raises(self):
df = DataFrame({'A': range(5), 'B': range(5, 10), 'C': 'foo'})
r = df.rolling(window=3)
with tm.assert_raises_regex(TypeError, 'cannot handle this type'):
r.sum()
def test_agg(self):
df = DataFrame({'A': range(5), 'B': range(0, 10, 2)})
r = df.rolling(window=3)
a_mean = r['A'].mean()
a_std = r['A'].std()
a_sum = r['A'].sum()
b_mean = r['B'].mean()
b_std = r['B'].std()
b_sum = r['B'].sum()
result = r.aggregate([np.mean, np.std])
expected = concat([a_mean, a_std, b_mean, b_std], axis=1)
expected.columns = pd.MultiIndex.from_product([['A', 'B'], ['mean',
'std']])
tm.assert_frame_equal(result, expected)
result = r.aggregate({'A': np.mean, 'B': np.std})
expected = concat([a_mean, b_std], axis=1)
tm.assert_frame_equal(result, expected, check_like=True)
result = r.aggregate({'A': ['mean', 'std']})
expected = concat([a_mean, a_std], axis=1)
expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'), ('A',
'std')])
tm.assert_frame_equal(result, expected)
result = r['A'].aggregate(['mean', 'sum'])
expected = concat([a_mean, a_sum], axis=1)
expected.columns = ['mean', 'sum']
tm.assert_frame_equal(result, expected)
with catch_warnings(record=True):
result = r.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}})
expected = concat([a_mean, a_sum], axis=1)
expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
('A', 'sum')])
tm.assert_frame_equal(result, expected, check_like=True)
with catch_warnings(record=True):
result = r.aggregate({'A': {'mean': 'mean',
'sum': 'sum'},
'B': {'mean2': 'mean',
'sum2': 'sum'}})
expected = concat([a_mean, a_sum, b_mean, b_sum], axis=1)
exp_cols = [('A', 'mean'), ('A', 'sum'), ('B', 'mean2'), ('B', 'sum2')]
expected.columns = pd.MultiIndex.from_tuples(exp_cols)
tm.assert_frame_equal(result, expected, check_like=True)
result = r.aggregate({'A': ['mean', 'std'], 'B': ['mean', 'std']})
expected = concat([a_mean, a_std, b_mean, b_std], axis=1)
exp_cols = [('A', 'mean'), ('A', 'std'), ('B', 'mean'), ('B', 'std')]
expected.columns = pd.MultiIndex.from_tuples(exp_cols)
tm.assert_frame_equal(result, expected, check_like=True)
def test_agg_apply(self, raw):
# passed lambda
df = DataFrame({'A': range(5), 'B': range(0, 10, 2)})
r = df.rolling(window=3)
a_sum = r['A'].sum()
result = r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)})
rcustom = r['B'].apply(lambda x: np.std(x, ddof=1), raw=raw)
expected = concat([a_sum, rcustom], axis=1)
tm.assert_frame_equal(result, expected, check_like=True)
def test_agg_consistency(self):
df = DataFrame({'A': range(5), 'B': range(0, 10, 2)})
r = df.rolling(window=3)
result = r.agg([np.sum, np.mean]).columns
expected = pd.MultiIndex.from_product([list('AB'), ['sum', 'mean']])
tm.assert_index_equal(result, expected)
result = r['A'].agg([np.sum, np.mean]).columns
expected = Index(['sum', 'mean'])
tm.assert_index_equal(result, expected)
result = r.agg({'A': [np.sum, np.mean]}).columns
expected = pd.MultiIndex.from_tuples([('A', 'sum'), ('A', 'mean')])
tm.assert_index_equal(result, expected)
def test_agg_nested_dicts(self):
# API change for disallowing these types of nested dicts
df = DataFrame({'A': range(5), 'B': range(0, 10, 2)})
r = df.rolling(window=3)
def f():
r.aggregate({'r1': {'A': ['mean', 'sum']},
'r2': {'B': ['mean', 'sum']}})
pytest.raises(SpecificationError, f)
expected = concat([r['A'].mean(), r['A'].std(),
r['B'].mean(), r['B'].std()], axis=1)
expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
with catch_warnings(record=True):
result = r[['A', 'B']].agg({'A': {'ra': ['mean', 'std']},
'B': {'rb': ['mean', 'std']}})
tm.assert_frame_equal(result, expected, check_like=True)
with catch_warnings(record=True):
result = r.agg({'A': {'ra': ['mean', 'std']},
'B': {'rb': ['mean', 'std']}})
expected.columns = pd.MultiIndex.from_tuples([('A', 'ra', 'mean'), (
'A', 'ra', 'std'), ('B', 'rb', 'mean'), ('B', 'rb', 'std')])
tm.assert_frame_equal(result, expected, check_like=True)
def test_count_nonnumeric_types(self):
# GH12541
cols = ['int', 'float', 'string', 'datetime', 'timedelta', 'periods',
'fl_inf', 'fl_nan', 'str_nan', 'dt_nat', 'periods_nat']
df = DataFrame(
{'int': [1, 2, 3],
'float': [4., 5., 6.],
'string': list('abc'),
'datetime': pd.date_range('20170101', periods=3),
'timedelta': pd.timedelta_range('1 s', periods=3, freq='s'),
'periods': [pd.Period('2012-01'), pd.Period('2012-02'),
pd.Period('2012-03')],
'fl_inf': [1., 2., np.Inf],
'fl_nan': [1., 2., np.NaN],
'str_nan': ['aa', 'bb', np.NaN],
'dt_nat': [Timestamp('20170101'), Timestamp('20170203'),
Timestamp(None)],
'periods_nat': [pd.Period('2012-01'), pd.Period('2012-02'),
pd.Period(None)]},
columns=cols)
expected = DataFrame(
{'int': [1., 2., 2.],
'float': [1., 2., 2.],
'string': [1., 2., 2.],
'datetime': [1., 2., 2.],
'timedelta': [1., 2., 2.],
'periods': [1., 2., 2.],
'fl_inf': [1., 2., 2.],
'fl_nan': [1., 2., 1.],
'str_nan': [1., 2., 1.],
'dt_nat': [1., 2., 1.],
'periods_nat': [1., 2., 1.]},
columns=cols)
result = df.rolling(window=2).count()
tm.assert_frame_equal(result, expected)
result = df.rolling(1).count()
expected = df.notna().astype(float)
tm.assert_frame_equal(result, expected)
@td.skip_if_no_scipy
def test_window_with_args(self):
# make sure that we are aggregating window functions correctly with args
r = Series(np.random.randn(100)).rolling(window=10, min_periods=1,
win_type='gaussian')
expected = concat([r.mean(std=10), r.mean(std=.01)], axis=1)
expected.columns = ['<lambda>', '<lambda>']
result = r.aggregate([lambda x: x.mean(std=10),
lambda x: x.mean(std=.01)])
tm.assert_frame_equal(result, expected)
def a(x):
return x.mean(std=10)
def b(x):
return x.mean(std=0.01)
expected = concat([r.mean(std=10), r.mean(std=.01)], axis=1)
expected.columns = ['a', 'b']
result = r.aggregate([a, b])
tm.assert_frame_equal(result, expected)
def test_preserve_metadata(self):
# GH 10565
s = Series(np.arange(100), name='foo')
s2 = s.rolling(30).sum()
s3 = s.rolling(20).sum()
assert s2.name == 'foo'
assert s3.name == 'foo'
class TestWindow(Base):
def setup_method(self, method):
self._create_data()
@td.skip_if_no_scipy
@pytest.mark.parametrize(
'which', ['series', 'frame'])
def test_constructor(self, which):
# GH 12669
o = getattr(self, which)
c = o.rolling
# valid
c(win_type='boxcar', window=2, min_periods=1)
c(win_type='boxcar', window=2, min_periods=1, center=True)
c(win_type='boxcar', window=2, min_periods=1, center=False)
# not valid
for w in [2., 'foo', np.array([2])]:
with pytest.raises(ValueError):
c(win_type='boxcar', window=2, min_periods=w)
with pytest.raises(ValueError):
c(win_type='boxcar', window=2, min_periods=1, center=w)
for wt in ['foobar', 1]:
with pytest.raises(ValueError):
c(win_type=wt, window=2)
@td.skip_if_no_scipy
@pytest.mark.parametrize(
'which', ['series', 'frame'])
def test_constructor_with_win_type(self, which, win_types):
# GH 12669
o = getattr(self, which)
c = o.rolling
c(win_type=win_types, window=2)
@pytest.mark.parametrize(
'method', ['sum', 'mean'])
def test_numpy_compat(self, method):
# see gh-12811
w = rwindow.Window(Series([2, 4, 6]), window=[0, 2])
msg = "numpy operations are not valid with window objects"
tm.assert_raises_regex(UnsupportedFunctionCall, msg,
getattr(w, method), 1, 2, 3)
tm.assert_raises_regex(UnsupportedFunctionCall, msg,
getattr(w, method), dtype=np.float64)
class TestRolling(Base):
def setup_method(self, method):
self._create_data()
def test_doc_string(self):
df = DataFrame({'B': [0, 1, 2, np.nan, 4]})
df
df.rolling(2).sum()
df.rolling(2, min_periods=1).sum()
@pytest.mark.parametrize(
'which', ['series', 'frame'])
def test_constructor(self, which):
# GH 12669
o = getattr(self, which)
c = o.rolling
# valid
c(window=2)
c(window=2, min_periods=1)
c(window=2, min_periods=1, center=True)
c(window=2, min_periods=1, center=False)
# GH 13383
with pytest.raises(ValueError):
c(0)
c(-1)
# not valid
for w in [2., 'foo', np.array([2])]:
with pytest.raises(ValueError):
c(window=w)
with pytest.raises(ValueError):
c(window=2, min_periods=w)
with pytest.raises(ValueError):
c(window=2, min_periods=1, center=w)
@td.skip_if_no_scipy
@pytest.mark.parametrize(
'which', ['series', 'frame'])
def test_constructor_with_win_type(self, which):
# GH 13383
o = getattr(self, which)
c = o.rolling
with pytest.raises(ValueError):
c(-1, win_type='boxcar')
@pytest.mark.parametrize(
'window', [timedelta(days=3), pd.Timedelta(days=3)])
def test_constructor_with_timedelta_window(self, window):
# GH 15440
n = 10
df = DataFrame({'value': np.arange(n)},
index=pd.date_range('2015-12-24', periods=n, freq="D"))
expected_data = np.append([0., 1.], np.arange(3., 27., 3))
result = df.rolling(window=window).sum()
expected = DataFrame({'value': expected_data},
index=pd.date_range('2015-12-24', periods=n,
freq="D"))
tm.assert_frame_equal(result, expected)
expected = df.rolling('3D').sum()
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
'window', [timedelta(days=3), pd.Timedelta(days=3), '3D'])
def test_constructor_timedelta_window_and_minperiods(self, window, raw):
# GH 15305
n = 10
df = DataFrame({'value': np.arange(n)},
index=pd.date_range('2017-08-08', periods=n, freq="D"))
expected = DataFrame(
{'value': np.append([np.NaN, 1.], np.arange(3., 27., 3))},
index=pd.date_range('2017-08-08', periods=n, freq="D"))
result_roll_sum = df.rolling(window=window, min_periods=2).sum()
result_roll_generic = df.rolling(window=window,
min_periods=2).apply(sum, raw=raw)
tm.assert_frame_equal(result_roll_sum, expected)
tm.assert_frame_equal(result_roll_generic, expected)
@pytest.mark.parametrize(
'method', ['std', 'mean', 'sum', 'max', 'min', 'var'])
def test_numpy_compat(self, method):
# see gh-12811
r = rwindow.Rolling(Series([2, 4, 6]), window=2)
msg = "numpy operations are not valid with window objects"
tm.assert_raises_regex(UnsupportedFunctionCall, msg,
getattr(r, method), 1, 2, 3)
tm.assert_raises_regex(UnsupportedFunctionCall, msg,
getattr(r, method), dtype=np.float64)
def test_closed(self):
df = DataFrame({'A': [0, 1, 2, 3, 4]})
# closed only allowed for datetimelike
with pytest.raises(ValueError):
df.rolling(window=3, closed='neither')
@pytest.mark.parametrize('roller', ['1s', 1])
def tests_empty_df_rolling(self, roller):
# GH 15819 Verifies that datetime and integer rolling windows can be
# applied to empty DataFrames
expected = DataFrame()
result = DataFrame().rolling(roller).sum()
tm.assert_frame_equal(result, expected)
# Verifies that datetime and integer rolling windows can be applied to
# empty DataFrames with datetime index
expected = DataFrame(index=pd.DatetimeIndex([]))
result = DataFrame(index=pd.DatetimeIndex([])).rolling(roller).sum()
tm.assert_frame_equal(result, expected)
def test_missing_minp_zero(self):
# https://github.com/pandas-dev/pandas/pull/18921
# minp=0
x = pd.Series([np.nan])
result = x.rolling(1, min_periods=0).sum()
expected = pd.Series([0.0])
tm.assert_series_equal(result, expected)
# minp=1
result = x.rolling(1, min_periods=1).sum()
expected = pd.Series([np.nan])
tm.assert_series_equal(result, expected)
def test_missing_minp_zero_variable(self):
# https://github.com/pandas-dev/pandas/pull/18921
x = pd.Series([np.nan] * 4,
index=pd.DatetimeIndex(['2017-01-01', '2017-01-04',
'2017-01-06', '2017-01-07']))
result = x.rolling(pd.Timedelta("2d"), min_periods=0).sum()
expected = pd.Series(0.0, index=x.index)
tm.assert_series_equal(result, expected)
def test_multi_index_names(self):
# GH 16789, 16825
cols = pd.MultiIndex.from_product([['A', 'B'], ['C', 'D', 'E']],
names=['1', '2'])
df = DataFrame(np.ones((10, 6)), columns=cols)
result = df.rolling(3).cov()
tm.assert_index_equal(result.columns, df.columns)
assert result.index.names == [None, '1', '2']
@pytest.mark.parametrize('klass', [pd.Series, pd.DataFrame])
def test_iter_raises(self, klass):
# https://github.com/pandas-dev/pandas/issues/11704
# Iteration over a Window
obj = klass([1, 2, 3, 4])
with pytest.raises(NotImplementedError):
iter(obj.rolling(2))
class TestExpanding(Base):
def setup_method(self, method):
self._create_data()
def test_doc_string(self):
df = DataFrame({'B': [0, 1, 2, np.nan, 4]})
df
df.expanding(2).sum()
@pytest.mark.parametrize(
'which', ['series', 'frame'])
def test_constructor(self, which):
# GH 12669
o = getattr(self, which)
c = o.expanding
# valid
c(min_periods=1)
c(min_periods=1, center=True)
c(min_periods=1, center=False)
# not valid
for w in [2., 'foo', np.array([2])]:
with pytest.raises(ValueError):
c(min_periods=w)
with pytest.raises(ValueError):
c(min_periods=1, center=w)
@pytest.mark.parametrize(
'method', ['std', 'mean', 'sum', 'max', 'min', 'var'])
def test_numpy_compat(self, method):
# see gh-12811
e = rwindow.Expanding(Series([2, 4, 6]), window=2)
msg = "numpy operations are not valid with window objects"
tm.assert_raises_regex(UnsupportedFunctionCall, msg,
getattr(e, method), 1, 2, 3)
tm.assert_raises_regex(UnsupportedFunctionCall, msg,
getattr(e, method), dtype=np.float64)
@pytest.mark.parametrize(
'expander',
[1, pytest.param('ls', marks=pytest.mark.xfail(
reason='GH 16425 expanding with '
'offset not supported'))])
def test_empty_df_expanding(self, expander):
# GH 15819 Verifies that datetime and integer expanding windows can be
# applied to empty DataFrames
expected = DataFrame()
result = DataFrame().expanding(expander).sum()
tm.assert_frame_equal(result, expected)
# Verifies that datetime and integer expanding windows can be applied
# to empty DataFrames with datetime index
expected = DataFrame(index=pd.DatetimeIndex([]))
result = DataFrame(
index=pd.DatetimeIndex([])).expanding(expander).sum()
tm.assert_frame_equal(result, expected)
def test_missing_minp_zero(self):
# https://github.com/pandas-dev/pandas/pull/18921
# minp=0
x = pd.Series([np.nan])
result = x.expanding(min_periods=0).sum()
expected = pd.Series([0.0])
tm.assert_series_equal(result, expected)
# minp=1
result = x.expanding(min_periods=1).sum()
expected = pd.Series([np.nan])
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('klass', [pd.Series, pd.DataFrame])
def test_iter_raises(self, klass):
# https://github.com/pandas-dev/pandas/issues/11704
# Iteration over a Window
obj = klass([1, 2, 3, 4])
with pytest.raises(NotImplementedError):
iter(obj.expanding(2))
class TestEWM(Base):
def setup_method(self, method):
self._create_data()
def test_doc_string(self):
df = DataFrame({'B': [0, 1, 2, np.nan, 4]})
df
df.ewm(com=0.5).mean()
@pytest.mark.parametrize(
'which', ['series', 'frame'])
def test_constructor(self, which):
o = getattr(self, which)
c = o.ewm
# valid
c(com=0.5)
c(span=1.5)
c(alpha=0.5)
c(halflife=0.75)
c(com=0.5, span=None)
c(alpha=0.5, com=None)
c(halflife=0.75, alpha=None)
# not valid: mutually exclusive
with pytest.raises(ValueError):
c(com=0.5, alpha=0.5)
with pytest.raises(ValueError):
c(span=1.5, halflife=0.75)
with pytest.raises(ValueError):
c(alpha=0.5, span=1.5)
# not valid: com < 0
with pytest.raises(ValueError):
c(com=-0.5)
# not valid: span < 1
with pytest.raises(ValueError):
c(span=0.5)
# not valid: halflife <= 0
with pytest.raises(ValueError):
c(halflife=0)
# not valid: alpha <= 0 or alpha > 1
for alpha in (-0.5, 1.5):
with pytest.raises(ValueError):
c(alpha=alpha)
@pytest.mark.parametrize(
'method', ['std', 'mean', 'var'])
def test_numpy_compat(self, method):
# see gh-12811
e = rwindow.EWM(Series([2, 4, 6]), alpha=0.5)
msg = "numpy operations are not valid with window objects"
tm.assert_raises_regex(UnsupportedFunctionCall, msg,
getattr(e, method), 1, 2, 3)
tm.assert_raises_regex(UnsupportedFunctionCall, msg,
getattr(e, method), dtype=np.float64)
# gh-12373 : rolling functions error on float32 data
# make sure rolling functions works for different dtypes
#
# NOTE that these are yielded tests and so _create_data
# is explicitly called.
#
# further note that we are only checking rolling for full dtype
# compliance (though both expanding and ewm inherit)
class Dtype(object):
window = 2
funcs = {
'count': lambda v: v.count(),
'max': lambda v: v.max(),
'min': lambda v: v.min(),
'sum': lambda v: v.sum(),
'mean': lambda v: v.mean(),
'std': lambda v: v.std(),
'var': lambda v: v.var(),
'median': lambda v: v.median()
}
def get_expects(self):
expects = {
'sr1': {
'count': Series([1, 2, 2, 2, 2], dtype='float64'),
'max': Series([np.nan, 1, 2, 3, 4], dtype='float64'),
'min': Series([np.nan, 0, 1, 2, 3], dtype='float64'),
'sum': Series([np.nan, 1, 3, 5, 7], dtype='float64'),
'mean': Series([np.nan, .5, 1.5, 2.5, 3.5], dtype='float64'),
'std': Series([np.nan] + [np.sqrt(.5)] * 4, dtype='float64'),
'var': Series([np.nan, .5, .5, .5, .5], dtype='float64'),
'median': Series([np.nan, .5, 1.5, 2.5, 3.5], dtype='float64')
},
'sr2': {
'count': Series([1, 2, 2, 2, 2], dtype='float64'),
'max': Series([np.nan, 10, 8, 6, 4], dtype='float64'),
'min': Series([np.nan, 8, 6, 4, 2], dtype='float64'),
'sum': Series([np.nan, 18, 14, 10, 6], dtype='float64'),
'mean': Series([np.nan, 9, 7, 5, 3], dtype='float64'),
'std': Series([np.nan] + [np.sqrt(2)] * 4, dtype='float64'),
'var': Series([np.nan, 2, 2, 2, 2], dtype='float64'),
'median': Series([np.nan, 9, 7, 5, 3], dtype='float64')
},
'df': {
'count': DataFrame({0: Series([1, 2, 2, 2, 2]),
1: Series([1, 2, 2, 2, 2])},
dtype='float64'),
'max': DataFrame({0: Series([np.nan, 2, 4, 6, 8]),
1: Series([np.nan, 3, 5, 7, 9])},
dtype='float64'),
'min': DataFrame({0: Series([np.nan, 0, 2, 4, 6]),
1: Series([np.nan, 1, 3, 5, 7])},
dtype='float64'),
'sum': DataFrame({0: Series([np.nan, 2, 6, 10, 14]),
1: Series([np.nan, 4, 8, 12, 16])},
dtype='float64'),
'mean': DataFrame({0: Series([np.nan, 1, 3, 5, 7]),
1: Series([np.nan, 2, 4, 6, 8])},
dtype='float64'),
'std': DataFrame({0: Series([np.nan] + [np.sqrt(2)] * 4),
1: Series([np.nan] + [np.sqrt(2)] * 4)},
dtype='float64'),
'var': DataFrame({0: Series([np.nan, 2, 2, 2, 2]),
1: Series([np.nan, 2, 2, 2, 2])},
dtype='float64'),
'median': DataFrame({0: Series([np.nan, 1, 3, 5, 7]),
1: Series([np.nan, 2, 4, 6, 8])},
dtype='float64'),
}
}
return expects
def _create_dtype_data(self, dtype):
sr1 = Series(np.arange(5), dtype=dtype)
sr2 = Series(np.arange(10, 0, -2), dtype=dtype)
df = DataFrame(np.arange(10).reshape((5, 2)), dtype=dtype)
data = {
'sr1': sr1,
'sr2': sr2,
'df': df
}
return data
def _create_data(self):
self.data = self._create_dtype_data(self.dtype)
self.expects = self.get_expects()
def test_dtypes(self):
self._create_data()
for f_name, d_name in product(self.funcs.keys(), self.data.keys()):
f = self.funcs[f_name]
d = self.data[d_name]
exp = self.expects[d_name][f_name]
self.check_dtypes(f, f_name, d, d_name, exp)
def check_dtypes(self, f, f_name, d, d_name, exp):
roll = d.rolling(window=self.window)
result = f(roll)
tm.assert_almost_equal(result, exp)
class TestDtype_object(Dtype):
dtype = object
class Dtype_integer(Dtype):
pass
class TestDtype_int8(Dtype_integer):
dtype = np.int8
class TestDtype_int16(Dtype_integer):
dtype = np.int16
class TestDtype_int32(Dtype_integer):
dtype = np.int32
class TestDtype_int64(Dtype_integer):
dtype = np.int64
class Dtype_uinteger(Dtype):
pass
class TestDtype_uint8(Dtype_uinteger):
dtype = np.uint8
class TestDtype_uint16(Dtype_uinteger):
dtype = np.uint16
class TestDtype_uint32(Dtype_uinteger):
dtype = np.uint32
class TestDtype_uint64(Dtype_uinteger):
dtype = np.uint64
class Dtype_float(Dtype):
pass
class TestDtype_float16(Dtype_float):
dtype = np.float16
class TestDtype_float32(Dtype_float):
dtype = np.float32
class TestDtype_float64(Dtype_float):
dtype = np.float64
class TestDtype_category(Dtype):
dtype = 'category'
include_df = False
def _create_dtype_data(self, dtype):
sr1 = Series(range(5), dtype=dtype)
sr2 = Series(range(10, 0, -2), dtype=dtype)
data = {
'sr1': sr1,
'sr2': sr2
}
return data
class DatetimeLike(Dtype):
def check_dtypes(self, f, f_name, d, d_name, exp):
roll = d.rolling(window=self.window)
if f_name == 'count':
result = f(roll)
tm.assert_almost_equal(result, exp)
else:
# other methods not implemented ATM
with pytest.raises(NotImplementedError):
f(roll)
class TestDtype_timedelta(DatetimeLike):
dtype = np.dtype('m8[ns]')
class TestDtype_datetime(DatetimeLike):
dtype = np.dtype('M8[ns]')
class TestDtype_datetime64UTC(DatetimeLike):
dtype = 'datetime64[ns, UTC]'
def _create_data(self):
pytest.skip("direct creation of extension dtype "
"datetime64[ns, UTC] is not supported ATM")
class TestMoments(Base):
def setup_method(self, method):
self._create_data()
def test_centered_axis_validation(self):
# ok
Series(np.ones(10)).rolling(window=3, center=True, axis=0).mean()
# bad axis
with pytest.raises(ValueError):
Series(np.ones(10)).rolling(window=3, center=True, axis=1).mean()
# ok ok
DataFrame(np.ones((10, 10))).rolling(window=3, center=True,
axis=0).mean()
DataFrame(np.ones((10, 10))).rolling(window=3, center=True,
axis=1).mean()
# bad axis
with pytest.raises(ValueError):
(DataFrame(np.ones((10, 10)))
.rolling(window=3, center=True, axis=2).mean())
def test_rolling_sum(self):
self._check_moment_func(np.nansum, name='sum',
zero_min_periods_equal=False)
def test_rolling_count(self):
counter = lambda x: np.isfinite(x).astype(float).sum()
self._check_moment_func(counter, name='count', has_min_periods=False,
fill_value=0)
def test_rolling_mean(self):
self._check_moment_func(np.mean, name='mean')
@td.skip_if_no_scipy
def test_cmov_mean(self):
# GH 8238
vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48,
10.63, 14.48])
result = Series(vals).rolling(5, center=True).mean()
expected = Series([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516,
12.818, 12.952, np.nan, np.nan])
tm.assert_series_equal(expected, result)
@td.skip_if_no_scipy
def test_cmov_window(self):
# GH 8238
vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48,
10.63, 14.48])
result = Series(vals).rolling(5, win_type='boxcar', center=True).mean()
expected = Series([np.nan, np.nan, 9.962, 11.27, 11.564, 12.516,
12.818, 12.952, np.nan, np.nan])
tm.assert_series_equal(expected, result)
@td.skip_if_no_scipy
def test_cmov_window_corner(self):
# GH 8238
# all nan
vals = pd.Series([np.nan] * 10)
result = vals.rolling(5, center=True, win_type='boxcar').mean()
assert np.isnan(result).all()
# empty
vals = pd.Series([])
result = vals.rolling(5, center=True, win_type='boxcar').mean()
assert len(result) == 0
# shorter than window
vals = pd.Series(np.random.randn(5))
result = vals.rolling(10, win_type='boxcar').mean()
assert np.isnan(result).all()
assert len(result) == 5
@td.skip_if_no_scipy
def test_cmov_window_frame(self):
# GH 8238
vals = np.array([[12.18, 3.64], [10.18, 9.16], [13.24, 14.61],
[4.51, 8.11], [6.15, 11.44], [9.14, 6.21],
[11.31, 10.67], [2.94, 6.51], [9.42, 8.39], [12.44,
7.34]])
xp = np.array([[np.nan, np.nan], [np.nan, np.nan], [9.252, 9.392],
[8.644, 9.906], [8.87, 10.208], [6.81, 8.588],
[7.792, 8.644], [9.05, 7.824], [np.nan, np.nan
], [np.nan, np.nan]])
# DataFrame
rs = DataFrame(vals).rolling(5, win_type='boxcar', center=True).mean()
tm.assert_frame_equal(DataFrame(xp), rs)
# invalid method
with pytest.raises(AttributeError):
(DataFrame(vals).rolling(5, win_type='boxcar', center=True)
.std())
# sum
xp = np.array([[np.nan, np.nan], [np.nan, np.nan], [46.26, 46.96],
[43.22, 49.53], [44.35, 51.04], [34.05, 42.94],
[38.96, 43.22], [45.25, 39.12], [np.nan, np.nan
], [np.nan, np.nan]])
rs = DataFrame(vals).rolling(5, win_type='boxcar', center=True).sum()
tm.assert_frame_equal(DataFrame(xp), rs)
@td.skip_if_no_scipy
def test_cmov_window_na_min_periods(self):
# min_periods
vals = Series(np.random.randn(10))
vals[4] = np.nan
vals[8] = np.nan
xp = vals.rolling(5, min_periods=4, center=True).mean()
rs = vals.rolling(5, win_type='boxcar', min_periods=4,
center=True).mean()
tm.assert_series_equal(xp, rs)
@td.skip_if_no_scipy
def test_cmov_window_regular(self, win_types):
# GH 8238
vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48,
10.63, 14.48])
xps = {
'hamming': [np.nan, np.nan, 8.71384, 9.56348, 12.38009, 14.03687,
13.8567, 11.81473, np.nan, np.nan],
'triang': [np.nan, np.nan, 9.28667, 10.34667, 12.00556, 13.33889,
13.38, 12.33667, np.nan, np.nan],
'barthann': [np.nan, np.nan, 8.4425, 9.1925, 12.5575, 14.3675,
14.0825, 11.5675, np.nan, np.nan],
'bohman': [np.nan, np.nan, 7.61599, 9.1764, 12.83559, 14.17267,
14.65923, 11.10401, np.nan, np.nan],
'blackmanharris': [np.nan, np.nan, 6.97691, 9.16438, 13.05052,
14.02156, 15.10512, 10.74574, np.nan, np.nan],
'nuttall': [np.nan, np.nan, 7.04618, 9.16786, 13.02671, 14.03559,
15.05657, 10.78514, np.nan, np.nan],
'blackman': [np.nan, np.nan, 7.73345, 9.17869, 12.79607, 14.20036,
14.57726, 11.16988, np.nan, np.nan],
'bartlett': [np.nan, np.nan, 8.4425, 9.1925, 12.5575, 14.3675,
14.0825, 11.5675, np.nan, np.nan]
}
xp = Series(xps[win_types])
rs = Series(vals).rolling(5, win_type=win_types, center=True).mean()
tm.assert_series_equal(xp, rs)
@td.skip_if_no_scipy
def test_cmov_window_regular_linear_range(self, win_types):
# GH 8238
vals = np.array(range(10), dtype=np.float)
xp = vals.copy()
xp[:2] = np.nan
xp[-2:] = np.nan
xp = Series(xp)
rs = Series(vals).rolling(5, win_type=win_types, center=True).mean()
tm.assert_series_equal(xp, rs)
@td.skip_if_no_scipy
def test_cmov_window_regular_missing_data(self, win_types):
# GH 8238
vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, np.nan,
10.63, 14.48])
xps = {
'bartlett': [np.nan, np.nan, 9.70333, 10.5225, 8.4425, 9.1925,
12.5575, 14.3675, 15.61667, 13.655],
'blackman': [np.nan, np.nan, 9.04582, 11.41536, 7.73345, 9.17869,
12.79607, 14.20036, 15.8706, 13.655],
'barthann': [np.nan, np.nan, 9.70333, 10.5225, 8.4425, 9.1925,
12.5575, 14.3675, 15.61667, 13.655],
'bohman': [np.nan, np.nan, 8.9444, 11.56327, 7.61599, 9.1764,
12.83559, 14.17267, 15.90976, 13.655],
'hamming': [np.nan, np.nan, 9.59321, 10.29694, 8.71384, 9.56348,
12.38009, 14.20565, 15.24694, 13.69758],
'nuttall': [np.nan, np.nan, 8.47693, 12.2821, 7.04618, 9.16786,
13.02671, 14.03673, 16.08759, 13.65553],
'triang': [np.nan, np.nan, 9.33167, 9.76125, 9.28667, 10.34667,
12.00556, 13.82125, 14.49429, 13.765],
'blackmanharris': [np.nan, np.nan, 8.42526, 12.36824, 6.97691,
9.16438, 13.05052, 14.02175, 16.1098, 13.65509]
}
xp = Series(xps[win_types])
rs = Series(vals).rolling(5, win_type=win_types, min_periods=3).mean()
tm.assert_series_equal(xp, rs)
@td.skip_if_no_scipy
def test_cmov_window_special(self, win_types_special):
# GH 8238
kwds = {
'kaiser': {'beta': 1.},
'gaussian': {'std': 1.},
'general_gaussian': {'power': 2., 'width': 2.}}
vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48,
10.63, 14.48])
xps = {
'gaussian': [np.nan, np.nan, 8.97297, 9.76077, 12.24763, 13.89053,
13.65671, 12.01002, np.nan, np.nan],
'general_gaussian': [np.nan, np.nan, 9.85011, 10.71589, 11.73161,
13.08516, 12.95111, 12.74577, np.nan, np.nan],
'kaiser': [np.nan, np.nan, 9.86851, 11.02969, 11.65161, 12.75129,
12.90702, 12.83757, np.nan, np.nan]
}
xp = Series(xps[win_types_special])
rs = Series(vals).rolling(
5, win_type=win_types_special, center=True).mean(
**kwds[win_types_special])
tm.assert_series_equal(xp, rs)
@td.skip_if_no_scipy
def test_cmov_window_special_linear_range(self, win_types_special):
# GH 8238
kwds = {
'kaiser': {'beta': 1.},
'gaussian': {'std': 1.},
'general_gaussian': {'power': 2., 'width': 2.},
'slepian': {'width': 0.5}}
vals = np.array(range(10), dtype=np.float)
xp = vals.copy()
xp[:2] = np.nan
xp[-2:] = np.nan
xp = Series(xp)
rs = Series(vals).rolling(
5, win_type=win_types_special, center=True).mean(
**kwds[win_types_special])
tm.assert_series_equal(xp, rs)
def test_rolling_median(self):
self._check_moment_func(np.median, name='median')
def test_rolling_min(self):
self._check_moment_func(np.min, name='min')
a = pd.Series([1, 2, 3, 4, 5])
result = a.rolling(window=100, min_periods=1).min()
expected = pd.Series(np.ones(len(a)))
tm.assert_series_equal(result, expected)
with pytest.raises(ValueError):
pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).min()
def test_rolling_max(self):
self._check_moment_func(np.max, name='max')
a = pd.Series([1, 2, 3, 4, 5], dtype=np.float64)
b = a.rolling(window=100, min_periods=1).max()
tm.assert_almost_equal(a, b)
with pytest.raises(ValueError):
pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).max()
@pytest.mark.parametrize('q', [0.0, .1, .5, .9, 1.0])
def test_rolling_quantile(self, q):
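# scoreatpercentile is a self-contained reference implementation that
# linearly interpolates between the two order statistics bracketing the
# requested quantile, mirroring what rolling(...).quantile(q) should return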
def scoreatpercentile(a, per):
values = np.sort(a, axis=0)
idx = int(per / 1. * (values.shape[0] - 1))
if idx == values.shape[0] - 1:
retval = values[-1]
else:
qlow = float(idx) / float(values.shape[0] - 1)
qhig = float(idx + 1) / float(values.shape[0] - 1)
vlow = values[idx]
vhig = values[idx + 1]
retval = vlow + (vhig - vlow) * (per - qlow) / (qhig - qlow)
return retval
def quantile_func(x):
return scoreatpercentile(x, q)
self._check_moment_func(quantile_func, name='quantile',
quantile=q)
def test_rolling_quantile_np_percentile(self):
# #9413: Tests that rolling window's quantile default behavior
# is analogous to NumPy's percentile
row = 10
col = 5
idx = pd.date_range('20100101', periods=row, freq='B')
df = DataFrame(np.random.rand(row * col).reshape((row, -1)), index=idx)
df_quantile = df.quantile([0.25, 0.5, 0.75], axis=0)
np_percentile = np.percentile(df, [25, 50, 75], axis=0)
tm.assert_almost_equal(df_quantile.values, np.array(np_percentile))
@pytest.mark.skipif(_np_version_under1p12,
reason='numpy midpoint interpolation is broken')
@pytest.mark.parametrize('quantile', [0.0, 0.1, 0.45, 0.5, 1])
@pytest.mark.parametrize('interpolation', ['linear', 'lower', 'higher',
'nearest', 'midpoint'])
@pytest.mark.parametrize('data', [[1., 2., 3., 4., 5., 6., 7.],
[8., 1., 3., 4., 5., 2., 6., 7.],
[0., np.nan, 0.2, np.nan, 0.4],
[np.nan, np.nan, np.nan, np.nan],
[np.nan, 0.1, np.nan, 0.3, 0.4, 0.5],
[0.5], [np.nan, 0.7, 0.6]])
def test_rolling_quantile_interpolation_options(self, quantile,
interpolation, data):
# Tests that rolling window's quantile behavior is analogous to
# Series' quantile for each interpolation option
s = Series(data)
q1 = s.quantile(quantile, interpolation)
q2 = s.expanding(min_periods=1).quantile(
quantile, interpolation).iloc[-1]
if np.isnan(q1):
assert np.isnan(q2)
else:
assert q1 == q2
def test_invalid_quantile_value(self):
data = np.arange(5)
s = Series(data)
with pytest.raises(ValueError, match="Interpolation 'invalid'"
" is not supported"):
s.rolling(len(data), min_periods=1).quantile(
0.5, interpolation='invalid')
def test_rolling_quantile_param(self):
ser = Series([0.0, .1, .5, .9, 1.0])
with pytest.raises(ValueError):
ser.rolling(3).quantile(-0.1)
with pytest.raises(ValueError):
ser.rolling(3).quantile(10.0)
with pytest.raises(TypeError):
ser.rolling(3).quantile('foo')
def test_rolling_apply(self, raw):
# suppress warnings about empty slices, as we are deliberately testing
# with a 0-length Series
with warnings.catch_warnings():
warnings.filterwarnings("ignore",
message=".*(empty slice|0 for slice).*",
category=RuntimeWarning)
def f(x):
return x[np.isfinite(x)].mean()
self._check_moment_func(np.mean, name='apply', func=f, raw=raw)
expected = Series([])
result = expected.rolling(10).apply(lambda x: x.mean(), raw=raw)
tm.assert_series_equal(result, expected)
# gh-8080
s = Series([None, None, None])
result = s.rolling(2, min_periods=0).apply(lambda x: len(x), raw=raw)
expected = Series([1., 2., 2.])
tm.assert_series_equal(result, expected)
result = s.rolling(2, min_periods=0).apply(len, raw=raw)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('klass', [Series, DataFrame])
@pytest.mark.parametrize(
'method', [lambda x: x.rolling(window=2), lambda x: x.expanding()])
def test_apply_future_warning(self, klass, method):
# gh-5071
s = klass(np.arange(3))
with tm.assert_produces_warning(FutureWarning):
method(s).apply(lambda x: len(x))
def test_rolling_apply_out_of_bounds(self, raw):
# gh-1850
vals = pd.Series([1, 2, 3, 4])
result = vals.rolling(10).apply(np.sum, raw=raw)
assert result.isna().all()
result = vals.rolling(10, min_periods=1).apply(np.sum, raw=raw)
expected = pd.Series([1, 3, 6, 10], dtype=float)
tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize('window', [2, '2s'])
def test_rolling_apply_with_pandas_objects(self, window):
# 5071
df = pd.DataFrame({'A': np.random.randn(5),
'B': np.random.randint(0, 10, size=5)},
index=pd.date_range('20130101', periods=5, freq='s'))
# we have an equally spaced time series index
# so simulate removing the first period
def f(x):
if x.index[0] == df.index[0]:
return np.nan
return x.iloc[-1]
result = df.rolling(window).apply(f, raw=False)
expected = df.iloc[2:].reindex_like(df)
tm.assert_frame_equal(result, expected)
with pytest.raises(AttributeError):
df.rolling(window).apply(f, raw=True)
def test_rolling_std(self):
self._check_moment_func(lambda x: np.std(x, ddof=1),
name='std')
self._check_moment_func(lambda x: np.std(x, ddof=0),
name='std', ddof=0)
def test_rolling_std_1obs(self):
vals = pd.Series([1., 2., 3., 4., 5.])
result = vals.rolling(1, min_periods=1).std()
expected = pd.Series([np.nan] * 5)
tm.assert_series_equal(result, expected)
result = vals.rolling(1, min_periods=1).std(ddof=0)
expected = pd.Series([0.] * 5)
tm.assert_series_equal(result, expected)
result = (pd.Series([np.nan, np.nan, 3, 4, 5])
.rolling(3, min_periods=2).std())
assert np.isnan(result[2])
def test_rolling_std_neg_sqrt(self):
# unit test from Bottleneck
# Test move_nanstd for neg sqrt.
a = pd.Series([0.0011448196318903589, 0.00028718669878572767,
0.00028718669878572767, 0.00028718669878572767,
0.00028718669878572767])
b = a.rolling(window=3).std()
assert np.isfinite(b[2:]).all()
b = a.ewm(span=3).std()
assert np.isfinite(b[2:]).all()
def test_rolling_var(self):
self._check_moment_func(lambda x: np.var(x, ddof=1),
name='var')
self._check_moment_func(lambda x: np.var(x, ddof=0),
name='var', ddof=0)
@td.skip_if_no_scipy
def test_rolling_skew(self):
from scipy.stats import skew
self._check_moment_func(lambda x: skew(x, bias=False), name='skew')
@td.skip_if_no_scipy
def test_rolling_kurt(self):
from scipy.stats import kurtosis
self._check_moment_func(lambda x: kurtosis(x, bias=False),
name='kurt')
def _check_moment_func(self, static_comp, name, has_min_periods=True,
has_center=True, has_time_rule=True,
fill_value=None, zero_min_periods_equal=True,
**kwargs):
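# shared checker: compare the rolling result of `name` against `static_comp`
# applied directly to the trailing window, then exercise resampled
# (time-rule) data, NaN exclusion, min_periods handling, windows longer than
# the series (GH 7297) and center=True against a shifted equivalent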
def get_result(obj, window, min_periods=None, center=False):
r = obj.rolling(window=window, min_periods=min_periods,
center=center)
return getattr(r, name)(**kwargs)
series_result = get_result(self.series, window=50)
assert isinstance(series_result, Series)
tm.assert_almost_equal(series_result.iloc[-1],
static_comp(self.series[-50:]))
frame_result = get_result(self.frame, window=50)
assert isinstance(frame_result, DataFrame)
tm.assert_series_equal(
frame_result.iloc[-1, :],
self.frame.iloc[-50:, :].apply(static_comp, axis=0, raw=raw),
check_names=False)
# check time_rule works
if has_time_rule:
win = 25
minp = 10
series = self.series[::2].resample('B').mean()
frame = self.frame[::2].resample('B').mean()
if has_min_periods:
series_result = get_result(series, window=win,
min_periods=minp)
frame_result = get_result(frame, window=win,
min_periods=minp)
else:
series_result = get_result(series, window=win)
frame_result = get_result(frame, window=win)
last_date = series_result.index[-1]
prev_date = last_date - 24 * offsets.BDay()
trunc_series = self.series[::2].truncate(prev_date, last_date)
trunc_frame = self.frame[::2].truncate(prev_date, last_date)
tm.assert_almost_equal(series_result[-1],
static_comp(trunc_series))
tm.assert_series_equal(frame_result.xs(last_date),
trunc_frame.apply(static_comp, raw=raw),
check_names=False)
# excluding NaNs correctly
obj = Series(randn(50))
obj[:10] = np.NaN
obj[-10:] = np.NaN
if has_min_periods:
result = get_result(obj, 50, min_periods=30)
tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10]))
# min_periods is working correctly
result = get_result(obj, 20, min_periods=15)
assert isna(result.iloc[23])
assert not isna(result.iloc[24])
assert not isna(result.iloc[-6])
assert isna(result.iloc[-5])
obj2 = Series(randn(20))
result = get_result(obj2, 10, min_periods=5)
assert isna(result.iloc[3])
assert notna(result.iloc[4])
if zero_min_periods_equal:
# min_periods=0 may be equivalent to min_periods=1
result0 = get_result(obj, 20, min_periods=0)
result1 = get_result(obj, 20, min_periods=1)
tm.assert_almost_equal(result0, result1)
else:
result = get_result(obj, 50)
tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10]))
# window larger than series length (#7297)
if has_min_periods:
for minp in (0, len(self.series) - 1, len(self.series)):
result = get_result(self.series, len(self.series) + 1,
min_periods=minp)
expected = get_result(self.series, len(self.series),
min_periods=minp)
nan_mask = isna(result)
tm.assert_series_equal(nan_mask, isna(expected))
nan_mask = ~nan_mask
tm.assert_almost_equal(result[nan_mask],
expected[nan_mask])
else:
result = get_result(self.series, len(self.series) + 1)
expected = get_result(self.series, len(self.series))
nan_mask = isna(result)
tm.assert_series_equal(nan_mask, isna(expected))
nan_mask = ~nan_mask
tm.assert_almost_equal(result[nan_mask], expected[nan_mask])
# check center=True
if has_center:
if has_min_periods:
result = get_result(obj, 20, min_periods=15, center=True)
expected = get_result(
pd.concat([obj, Series([np.NaN] * 9)]), 20,
min_periods=15)[9:].reset_index(drop=True)
else:
result = get_result(obj, 20, center=True)
expected = get_result(
pd.concat([obj, Series([np.NaN] * 9)]),
20)[9:].reset_index(drop=True)
tm.assert_series_equal(result, expected)
# shifter index
s = ['x%d' % x for x in range(12)]
if has_min_periods:
minp = 10
series_xp = get_result(
self.series.reindex(list(self.series.index) + s),
window=25,
min_periods=minp).shift(-12).reindex(self.series.index)
frame_xp = get_result(
self.frame.reindex(list(self.frame.index) + s),
window=25,
min_periods=minp).shift(-12).reindex(self.frame.index)
series_rs = get_result(self.series, window=25,
min_periods=minp, center=True)
frame_rs = get_result(self.frame, window=25, min_periods=minp,
center=True)
else:
series_xp = get_result(
self.series.reindex(list(self.series.index) + s),
window=25).shift(-12).reindex(self.series.index)
frame_xp = get_result(
self.frame.reindex(list(self.frame.index) + s),
window=25).shift(-12).reindex(self.frame.index)
series_rs = get_result(self.series, window=25, center=True)
frame_rs = get_result(self.frame, window=25, center=True)
if fill_value is not None:
series_xp = series_xp.fillna(fill_value)
frame_xp = frame_xp.fillna(fill_value)
tm.assert_series_equal(series_xp, series_rs)
tm.assert_frame_equal(frame_xp, frame_rs)
def test_ewma(self):
self._check_ew(name='mean')
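# impulse response check: with adjust=False a single unit spike is spread
# over later points with weights alpha * (1 - alpha) ** k, which sum to ~1,
# so the EWM mean of an impulse should itself sum to ~1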
vals = pd.Series(np.zeros(1000))
vals[5] = 1
result = vals.ewm(span=100, adjust=False).mean().sum()
assert np.abs(result - 1) < 1e-2
@pytest.mark.parametrize('adjust', [True, False])
@pytest.mark.parametrize('ignore_na', [True, False])
def test_ewma_cases(self, adjust, ignore_na):
# try adjust/ignore_na args matrix
s = Series([1.0, 2.0, 4.0, 8.0])
if adjust:
expected = Series([1.0, 1.6, 2.736842, 4.923077])
else:
expected = Series([1.0, 1.333333, 2.222222, 4.148148])
result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean()
tm.assert_series_equal(result, expected)
def test_ewma_nan_handling(self):
s = Series([1.] + [np.nan] * 5 + [1.])
result = s.ewm(com=5).mean()
tm.assert_series_equal(result, Series([1.] * len(s)))
s = Series([np.nan] * 2 + [1.] + [np.nan] * 2 + [1.])
result = s.ewm(com=5).mean()
tm.assert_series_equal(result, Series([np.nan] * 2 + [1.] * 4))
# GH 7603
s0 = Series([np.nan, 1., 101.])
s1 = Series([1., np.nan, 101.])
s2 = Series([np.nan, 1., np.nan, np.nan, 101., np.nan])
s3 = Series([1., np.nan, 101., 50.])
com = 2.
alpha = 1. / (1. + com)
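# simple_wma computes a reference weighted moving average from explicit
# per-observation weights; the tuples below spell out the effective weight
# each observation receives under every adjust/ignore_na combination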
def simple_wma(s, w):
return (s.multiply(w).cumsum() / w.cumsum()).fillna(method='ffill')
for (s, adjust, ignore_na, w) in [
(s0, True, False, [np.nan, (1. - alpha), 1.]),
(s0, True, True, [np.nan, (1. - alpha), 1.]),
(s0, False, False, [np.nan, (1. - alpha), alpha]),
(s0, False, True, [np.nan, (1. - alpha), alpha]),
(s1, True, False, [(1. - alpha) ** 2, np.nan, 1.]),
(s1, True, True, [(1. - alpha), np.nan, 1.]),
(s1, False, False, [(1. - alpha) ** 2, np.nan, alpha]),
(s1, False, True, [(1. - alpha), np.nan, alpha]),
(s2, True, False, [np.nan, (1. - alpha) **
3, np.nan, np.nan, 1., np.nan]),
(s2, True, True, [np.nan, (1. - alpha),
np.nan, np.nan, 1., np.nan]),
(s2, False, False, [np.nan, (1. - alpha) **
3, np.nan, np.nan, alpha, np.nan]),
(s2, False, True, [np.nan, (1. - alpha),
np.nan, np.nan, alpha, np.nan]),
(s3, True, False, [(1. - alpha) **
3, np.nan, (1. - alpha), 1.]),
(s3, True, True, [(1. - alpha) **
2, np.nan, (1. - alpha), 1.]),
(s3, False, False, [(1. - alpha) ** 3, np.nan,
(1. - alpha) * alpha,
alpha * ((1. - alpha) ** 2 + alpha)]),
(s3, False, True, [(1. - alpha) ** 2,
np.nan, (1. - alpha) * alpha, alpha])]:
expected = simple_wma(s, Series(w))
result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean()
tm.assert_series_equal(result, expected)
if ignore_na is False:
# check that ignore_na defaults to False
result = s.ewm(com=com, adjust=adjust).mean()
tm.assert_series_equal(result, expected)
def test_ewmvar(self):
self._check_ew(name='var')
def test_ewmvol(self):
self._check_ew(name='vol')
def test_ewma_span_com_args(self):
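# com and span parameterize the same decay via span = 2 * com + 1,
# so com=9.5 is equivalent to span=20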
A = self.series.ewm(com=9.5).mean()
B = self.series.ewm(span=20).mean()
tm.assert_almost_equal(A, B)
with pytest.raises(ValueError):
self.series.ewm(com=9.5, span=20)
with pytest.raises(ValueError):
self.series.ewm().mean()
def test_ewma_halflife_arg(self):
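# halflife=10 corresponds to alpha = 1 - exp(log(0.5) / 10) ~= 0.0670,
# i.e. com = 1 / alpha - 1 ~= 13.9327, hence the value below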
A = self.series.ewm(com=13.932726172912965).mean()
B = self.series.ewm(halflife=10.0).mean()
tm.assert_almost_equal(A, B)
with pytest.raises(ValueError):
self.series.ewm(span=20, halflife=50)
with pytest.raises(ValueError):
self.series.ewm(com=9.5, halflife=50)
with pytest.raises(ValueError):
self.series.ewm(com=9.5, span=20, halflife=50)
with pytest.raises(ValueError):
self.series.ewm()
def test_ewm_alpha(self):
# GH 10789
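# the decay parameters are interchangeable through
# alpha = 1 / (1 + com) = 2 / (span + 1) = 1 - exp(log(0.5) / halflife);
# each value below corresponds to alpha ~= 0.61723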
s = Series(self.arr)
a = s.ewm(alpha=0.61722699889169674).mean()
b = s.ewm(com=0.62014947789973052).mean()
c = s.ewm(span=2.240298955799461).mean()
d = s.ewm(halflife=0.721792864318).mean()
tm.assert_series_equal(a, b)
tm.assert_series_equal(a, c)
tm.assert_series_equal(a, d)
def test_ewm_alpha_arg(self):
# GH 10789
s = self.series
with pytest.raises(ValueError):
s.ewm()
with pytest.raises(ValueError):
s.ewm(com=10.0, alpha=0.5)
with pytest.raises(ValueError):
s.ewm(span=10.0, alpha=0.5)
with pytest.raises(ValueError):
s.ewm(halflife=10.0, alpha=0.5)
def test_ewm_domain_checks(self):
# GH 12492
s = Series(self.arr)
# com must satisfy: com >= 0
pytest.raises(ValueError, s.ewm, com=-0.1)
s.ewm(com=0.0)
s.ewm(com=0.1)
# span must satisfy: span >= 1
pytest.raises(ValueError, s.ewm, span=-0.1)
pytest.raises(ValueError, s.ewm, span=0.0)
pytest.raises(ValueError, s.ewm, span=0.9)
s.ewm(span=1.0)
s.ewm(span=1.1)
# halflife must satisfy: halflife > 0
pytest.raises(ValueError, s.ewm, halflife=-0.1)
pytest.raises(ValueError, s.ewm, halflife=0.0)
s.ewm(halflife=0.1)
# alpha must satisfy: 0 < alpha <= 1
pytest.raises(ValueError, s.ewm, alpha=-0.1)
pytest.raises(ValueError, s.ewm, alpha=0.0)
s.ewm(alpha=0.1)
s.ewm(alpha=1.0)
pytest.raises(ValueError, s.ewm, alpha=1.1)
@pytest.mark.parametrize('method', ['mean', 'vol', 'var'])
def test_ew_empty_series(self, method):
vals = pd.Series([], dtype=np.float64)
ewm = vals.ewm(3)
result = getattr(ewm, method)()
tm.assert_almost_equal(result, vals)
def _check_ew(self, name=None, preserve_nan=False):
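# shared EWM checker: verifies return types, NaN handling at the start of
# the series, the min_periods cutoff (GH 7898) and degenerate inputs
# (length 0, length 1, integer dtype) for the given statistic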
series_result = getattr(self.series.ewm(com=10), name)()
assert isinstance(series_result, Series)
frame_result = getattr(self.frame.ewm(com=10), name)()
assert type(frame_result) == DataFrame
result = getattr(self.series.ewm(com=10), name)()
if preserve_nan:
assert result[self._nan_locs].isna().all()
# excluding NaNs correctly
arr = randn(50)
arr[:10] = np.NaN
arr[-10:] = np.NaN
s = Series(arr)
# check min_periods
# GH 7898
result = getattr(s.ewm(com=50, min_periods=2), name)()
assert result[:11].isna().all()
assert not result[11:].isna().any()
for min_periods in (0, 1):
result = getattr(s.ewm(com=50, min_periods=min_periods), name)()
if name == 'mean':
assert result[:10].isna().all()
assert not result[10:].isna().any()
else:
# ewm.std, ewm.vol, ewm.var (with bias=False) require at least
# two values
assert result[:11].isna().all()
assert not result[11:].isna().any()
# check series of length 0
result = getattr(Series().ewm(com=50, min_periods=min_periods),
name)()
tm.assert_series_equal(result, Series())
# check series of length 1
result = getattr(Series([1.]).ewm(50, min_periods=min_periods),
name)()
if name == 'mean':
tm.assert_series_equal(result, Series([1.]))
else:
# ewm.std, ewm.vol, ewm.var with bias=False require at least
# two values
tm.assert_series_equal(result, Series([np.NaN]))
# pass in ints
result2 = getattr(Series(np.arange(50)).ewm(span=10), name)()
assert result2.dtype == np.float_
class TestPairwise(object):
# GH 7738
df1s = [DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]),
DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]),
DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]),
DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]],
columns=['C', 'C']),
DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1., 0]),
DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0., 1]),
DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=['C', 1]),
DataFrame([[2., 4.], [1., 2.], [5., 2.], [8., 1.]],
columns=[1, 0.]),
DataFrame([[2, 4.], [1, 2.], [5, 2.], [8, 1.]],
columns=[0, 1.]),
DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.]],
columns=[1., 'X']), ]
df2 = DataFrame([[None, 1, 1], [None, 1, 2],
[None, 3, 2], [None, 8, 1]], columns=['Y', 'Z', 'X'])
s = Series([1, 1, 3, 8])
def compare(self, result, expected):
# since we have sorted the results
# we can only compare non-nans
result = result.dropna().values
expected = expected.dropna().values
tm.assert_numpy_array_equal(result, expected, check_dtype=False)
@pytest.mark.parametrize('f', [lambda x: x.cov(), lambda x: x.corr()])
def test_no_flex(self, f):
# DataFrame methods (which do not call _flex_binary_moment())
results = [f(df) for df in self.df1s]
for (df, result) in zip(self.df1s, results):
tm.assert_index_equal(result.index, df.columns)
tm.assert_index_equal(result.columns, df.columns)
for i, result in enumerate(results):
if i > 0:
self.compare(result, results[0])
@pytest.mark.parametrize(
'f', [lambda x: x.expanding().cov(pairwise=True),
lambda x: x.expanding().corr(pairwise=True),
lambda x: x.rolling(window=3).cov(pairwise=True),
lambda x: x.rolling(window=3).corr(pairwise=True),
lambda x: x.ewm(com=3).cov(pairwise=True),
lambda x: x.ewm(com=3).corr(pairwise=True)])
def test_pairwise_with_self(self, f):
# DataFrame with itself, pairwise=True
# note that we may construct the 1st level of the MI
# in a non-monotonic way, so compare accordingly
results = []
for i, df in enumerate(self.df1s):
result = f(df)
tm.assert_index_equal(result.index.levels[0],
df.index,
check_names=False)
tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]),
safe_sort(df.columns.unique()))
tm.assert_index_equal(result.columns, df.columns)
results.append(result)
for i, result in enumerate(results):
if i > 0:
self.compare(result, results[0])
@pytest.mark.parametrize(
'f', [lambda x: x.expanding().cov(pairwise=False),
lambda x: x.expanding().corr(pairwise=False),
lambda x: x.rolling(window=3).cov(pairwise=False),
lambda x: x.rolling(window=3).corr(pairwise=False),
lambda x: x.ewm(com=3).cov(pairwise=False),
lambda x: x.ewm(com=3).corr(pairwise=False), ])
def test_no_pairwise_with_self(self, f):
# DataFrame with itself, pairwise=False
results = [f(df) for df in self.df1s]
for (df, result) in zip(self.df1s, results):
tm.assert_index_equal(result.index, df.index)
tm.assert_index_equal(result.columns, df.columns)
for i, result in enumerate(results):
if i > 0:
self.compare(result, results[0])
@pytest.mark.parametrize(
'f', [lambda x, y: x.expanding().cov(y, pairwise=True),
lambda x, y: x.expanding().corr(y, pairwise=True),
lambda x, y: x.rolling(window=3).cov(y, pairwise=True),
lambda x, y: x.rolling(window=3).corr(y, pairwise=True),
lambda x, y: x.ewm(com=3).cov(y, pairwise=True),
lambda x, y: x.ewm(com=3).corr(y, pairwise=True), ])
def test_pairwise_with_other(self, f):
# DataFrame with another DataFrame, pairwise=True
results = [f(df, self.df2) for df in self.df1s]
for (df, result) in zip(self.df1s, results):
tm.assert_index_equal(result.index.levels[0],
df.index,
check_names=False)
tm.assert_numpy_array_equal(safe_sort(result.index.levels[1]),
safe_sort(self.df2.columns.unique()))
for i, result in enumerate(results):
if i > 0:
self.compare(result, results[0])
@pytest.mark.parametrize(
'f', [lambda x, y: x.expanding().cov(y, pairwise=False),
lambda x, y: x.expanding().corr(y, pairwise=False),
lambda x, y: x.rolling(window=3).cov(y, pairwise=False),
lambda x, y: x.rolling(window=3).corr(y, pairwise=False),
lambda x, y: x.ewm(com=3).cov(y, pairwise=False),
lambda x, y: x.ewm(com=3).corr(y, pairwise=False), ])
def test_no_pairwise_with_other(self, f):
# DataFrame with another DataFrame, pairwise=False
results = [f(df, self.df2) if df.columns.is_unique else None
for df in self.df1s]
for (df, result) in zip(self.df1s, results):
if result is not None:
with catch_warnings(record=True):
# we can have int and str columns
expected_index = df.index.union(self.df2.index)
expected_columns = df.columns.union(self.df2.columns)
tm.assert_index_equal(result.index, expected_index)
tm.assert_index_equal(result.columns, expected_columns)
else:
tm.assert_raises_regex(
ValueError, "'arg1' columns are not unique", f, df,
self.df2)
tm.assert_raises_regex(
ValueError, "'arg2' columns are not unique", f,
self.df2, df)
@pytest.mark.parametrize(
'f', [lambda x, y: x.expanding().cov(y),
lambda x, y: x.expanding().corr(y),
lambda x, y: x.rolling(window=3).cov(y),
lambda x, y: x.rolling(window=3).corr(y),
lambda x, y: x.ewm(com=3).cov(y),
lambda x, y: x.ewm(com=3).corr(y), ])
def test_pairwise_with_series(self, f):
# DataFrame with a Series
results = ([f(df, self.s) for df in self.df1s] +
[f(self.s, df) for df in self.df1s])
for (df, result) in zip(self.df1s, results):
tm.assert_index_equal(result.index, df.index)
tm.assert_index_equal(result.columns, df.columns)
for i, result in enumerate(results):
if i > 0:
self.compare(result, results[0])
# create the data only once as we are not mutating it
def _create_consistency_data():
def create_series():
return [Series(),
Series([np.nan]),
Series([np.nan, np.nan]),
Series([3.]),
Series([np.nan, 3.]),
Series([3., np.nan]),
Series([1., 3.]),
Series([2., 2.]),
Series([3., 1.]),
Series([5., 5., 5., 5., np.nan, np.nan, np.nan, 5., 5., np.nan,
np.nan]),
Series([np.nan, 5., 5., 5., np.nan, np.nan, np.nan, 5., 5.,
np.nan, np.nan]),
Series([np.nan, np.nan, 5., 5., np.nan, np.nan, np.nan, 5., 5.,
np.nan, np.nan]),
Series([np.nan, 3., np.nan, 3., 4., 5., 6., np.nan, np.nan, 7.,
12., 13., 14., 15.]),
Series([np.nan, 5., np.nan, 2., 4., 0., 9., np.nan, np.nan, 3.,
12., 13., 14., 15.]),
Series([2., 3., np.nan, 3., 4., 5., 6., np.nan, np.nan, 7.,
12., 13., 14., 15.]),
Series([2., 5., np.nan, 2., 4., 0., 9., np.nan, np.nan, 3.,
12., 13., 14., 15.]),
Series(range(10)),
Series(range(20, 0, -2)), ]
def create_dataframes():
return ([DataFrame(),
DataFrame(columns=['a']),
DataFrame(columns=['a', 'a']),
DataFrame(columns=['a', 'b']),
DataFrame(np.arange(10).reshape((5, 2))),
DataFrame(np.arange(25).reshape((5, 5))),
DataFrame(np.arange(25).reshape((5, 5)),
columns=['a', 'b', 99, 'd', 'd'])] +
[DataFrame(s) for s in create_series()])
def is_constant(x):
values = x.values.ravel()
return len(set(values[notna(values)])) == 1
def no_nans(x):
return x.notna().all().all()
# data is a tuple (object, is_constant, no_nans)
data = create_series() + create_dataframes()
return [(x, is_constant(x), no_nans(x)) for x in data]
_consistency_data = _create_consistency_data()
def _rolling_consistency_cases():
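# yield every valid (window, min_periods, center) combination, skipping
# cases where min_periods would exceed the window length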
for window in [1, 2, 3, 10, 20]:
for min_periods in set([0, 1, 2, 3, 4, window]):
if min_periods and (min_periods > window):
continue
for center in [False, True]:
yield window, min_periods, center
class TestMomentsConsistency(Base):
base_functions = [
(lambda v: Series(v).count(), None, 'count'),
(lambda v: Series(v).max(), None, 'max'),
(lambda v: Series(v).min(), None, 'min'),
(lambda v: Series(v).sum(), None, 'sum'),
(lambda v: Series(v).mean(), None, 'mean'),
(lambda v: Series(v).std(), 1, 'std'),
(lambda v: Series(v).cov(Series(v)), None, 'cov'),
(lambda v: Series(v).corr(Series(v)), None, 'corr'),
(lambda v: Series(v).var(), 1, 'var'),
# restore once GH 8086 is fixed
# lambda v: Series(v).skew(), 3, 'skew'),
# (lambda v: Series(v).kurt(), 4, 'kurt'),
# restore once GH 8084 is fixed
# lambda v: Series(v).quantile(0.3), None, 'quantile'),
(lambda v: Series(v).median(), None, 'median'),
(np.nanmax, 1, 'max'),
(np.nanmin, 1, 'min'),
(np.nansum, 1, 'sum'),
(np.nanmean, 1, 'mean'),
(lambda v: np.nanstd(v, ddof=1), 1, 'std'),
(lambda v: np.nanvar(v, ddof=1), 1, 'var'),
(np.nanmedian, 1, 'median'),
]
no_nan_functions = [
(np.max, None, 'max'),
(np.min, None, 'min'),
(np.sum, None, 'sum'),
(np.mean, None, 'mean'),
(lambda v: np.std(v, ddof=1), 1, 'std'),
(lambda v: np.var(v, ddof=1), 1, 'var'),
(np.median, None, 'median'),
]
def _create_data(self):
super(TestMomentsConsistency, self)._create_data()
self.data = _consistency_data
def setup_method(self, method):
self._create_data()
def _test_moments_consistency(self, min_periods, count, mean, mock_mean,
corr, var_unbiased=None, std_unbiased=None,
cov_unbiased=None, var_biased=None,
std_biased=None, cov_biased=None,
var_debiasing_factors=None):
def _non_null_values(x):
values = x.values.ravel()
return set(values[notna(values)].tolist())
for (x, is_constant, no_nans) in self.data:
count_x = count(x)
mean_x = mean(x)
if mock_mean:
# check that mean equals mock_mean
expected = mock_mean(x)
assert_equal(mean_x, expected.astype('float64'))
# check that correlation of a series with itself is either 1 or NaN
corr_x_x = corr(x, x)
# assert _non_null_values(corr_x_x).issubset(set([1.]))
# restore once rolling_cov(x, x) is identically equal to var(x)
if is_constant:
exp = x.max() if isinstance(x, Series) else x.max().max()
# check mean of constant series
expected = x * np.nan
expected[count_x >= max(min_periods, 1)] = exp
assert_equal(mean_x, expected)
# check correlation of constant series with itself is NaN
expected[:] = np.nan
assert_equal(corr_x_x, expected)
if var_unbiased and var_biased and var_debiasing_factors:
# check variance debiasing factors
var_unbiased_x = var_unbiased(x)
var_biased_x = var_biased(x)
var_debiasing_factors_x = var_debiasing_factors(x)
assert_equal(var_unbiased_x, var_biased_x *
var_debiasing_factors_x)
for (std, var, cov) in [(std_biased, var_biased, cov_biased),
(std_unbiased, var_unbiased, cov_unbiased)
]:
# check that var(x), std(x), and cov(x) are all >= 0
var_x = var(x)
std_x = std(x)
assert not (var_x < 0).any().any()
assert not (std_x < 0).any().any()
if cov:
cov_x_x = cov(x, x)
assert not (cov_x_x < 0).any().any()
# check that var(x) == cov(x, x)
assert_equal(var_x, cov_x_x)
# check that var(x) == std(x)^2
assert_equal(var_x, std_x * std_x)
if var is var_biased:
# check that biased var(x) == mean(x^2) - mean(x)^2
mean_x2 = mean(x * x)
assert_equal(var_x, mean_x2 - (mean_x * mean_x))
if is_constant:
# check that variance of constant series is identically 0
assert not (var_x > 0).any().any()
expected = x * np.nan
expected[count_x >= max(min_periods, 1)] = 0.
if var is var_unbiased:
expected[count_x < 2] = np.nan
assert_equal(var_x, expected)
if isinstance(x, Series):
for (y, is_constant, no_nans) in self.data:
if not x.isna().equals(y.isna()):
# can only easily test two Series with similar
# structure
continue
                    # check that corr(x, y) is symmetric
corr_x_y = corr(x, y)
corr_y_x = corr(y, x)
assert_equal(corr_x_y, corr_y_x)
if cov:
# check that cov(x, y) is symmetric
cov_x_y = cov(x, y)
cov_y_x = cov(y, x)
assert_equal(cov_x_y, cov_y_x)
# check that cov(x, y) == (var(x+y) - var(x) -
# var(y)) / 2
var_x_plus_y = var(x + y)
var_y = var(y)
assert_equal(cov_x_y, 0.5 *
(var_x_plus_y - var_x - var_y))
# check that corr(x, y) == cov(x, y) / (std(x) *
# std(y))
std_y = std(y)
assert_equal(corr_x_y, cov_x_y / (std_x * std_y))
if cov is cov_biased:
# check that biased cov(x, y) == mean(x*y) -
# mean(x)*mean(y)
mean_y = mean(y)
mean_x_times_y = mean(x * y)
assert_equal(cov_x_y, mean_x_times_y -
(mean_x * mean_y))
@pytest.mark.slow
@pytest.mark.parametrize(
'min_periods, adjust, ignore_na', product([0, 1, 2, 3, 4],
[True, False],
[False, True]))
def test_ewm_consistency(self, min_periods, adjust, ignore_na):
def _weights(s, com, adjust, ignore_na):
if isinstance(s, DataFrame):
if not len(s.columns):
return DataFrame(index=s.index, columns=s.columns)
w = concat([
_weights(s.iloc[:, i], com=com, adjust=adjust,
ignore_na=ignore_na)
for i, _ in enumerate(s.columns)], axis=1)
w.index = s.index
w.columns = s.columns
return w
w = Series(np.nan, index=s.index)
alpha = 1. / (1. + com)
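            # Build explicit EW weights: with adjust=True each non-null
            # observation i gets weight (1 - alpha) ** -i; with adjust=False
            # a new observation gets alpha times the running weight total,
            # scaled by (1 - alpha) ** -gap for the gap since the previous
            # non-null value.  The s.iat[i] == s.iat[i] comparisons below
            # are NaN checks.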
if ignore_na:
w[s.notna()] = _weights(s[s.notna()], com=com,
adjust=adjust, ignore_na=False)
elif adjust:
for i in range(len(s)):
if s.iat[i] == s.iat[i]:
w.iat[i] = pow(1. / (1. - alpha), i)
else:
sum_wts = 0.
prev_i = -1
for i in range(len(s)):
if s.iat[i] == s.iat[i]:
if prev_i == -1:
w.iat[i] = 1.
else:
w.iat[i] = alpha * sum_wts / pow(1. - alpha,
i - prev_i)
sum_wts += w.iat[i]
prev_i = i
return w
def _variance_debiasing_factors(s, com, adjust, ignore_na):
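            # debiasing factor (sum w) ** 2 / ((sum w) ** 2 - sum(w ** 2)),
            # i.e. the factor turning the biased weighted variance into the
            # bias=False estimate; non-positive denominators become NaN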
weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na)
cum_sum = weights.cumsum().fillna(method='ffill')
cum_sum_sq = (weights * weights).cumsum().fillna(method='ffill')
numerator = cum_sum * cum_sum
denominator = numerator - cum_sum_sq
denominator[denominator <= 0.] = np.nan
return numerator / denominator
def _ewma(s, com, min_periods, adjust, ignore_na):
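            # reference EWMA: weighted cumulative mean of s using the
            # weights above, masked to NaN until max(min_periods, 1)
            # non-null values have been observed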
weights = _weights(s, com=com, adjust=adjust, ignore_na=ignore_na)
result = s.multiply(weights).cumsum().divide(weights.cumsum(
)).fillna(method='ffill')
result[s.expanding().count() < (max(min_periods, 1) if min_periods
else 1)] = np.nan
return result
com = 3.
# test consistency between different ewm* moments
self._test_moments_consistency(
min_periods=min_periods,
count=lambda x: x.expanding().count(),
mean=lambda x: x.ewm(com=com, min_periods=min_periods,
adjust=adjust,
ignore_na=ignore_na).mean(),
mock_mean=lambda x: _ewma(x, com=com,
min_periods=min_periods,
adjust=adjust,
ignore_na=ignore_na),
corr=lambda x, y: x.ewm(com=com, min_periods=min_periods,
adjust=adjust,
ignore_na=ignore_na).corr(y),
var_unbiased=lambda x: (
x.ewm(com=com, min_periods=min_periods,
adjust=adjust,
ignore_na=ignore_na).var(bias=False)),
std_unbiased=lambda x: (
x.ewm(com=com, min_periods=min_periods,
adjust=adjust, ignore_na=ignore_na)
.std(bias=False)),
cov_unbiased=lambda x, y: (
x.ewm(com=com, min_periods=min_periods,
adjust=adjust, ignore_na=ignore_na)
.cov(y, bias=False)),
var_biased=lambda x: (
x.ewm(com=com, min_periods=min_periods,
adjust=adjust, ignore_na=ignore_na)
.var(bias=True)),
std_biased=lambda x: x.ewm(com=com, min_periods=min_periods,
adjust=adjust,
ignore_na=ignore_na).std(bias=True),
cov_biased=lambda x, y: (
x.ewm(com=com, min_periods=min_periods,
adjust=adjust, ignore_na=ignore_na)
.cov(y, bias=True)),
var_debiasing_factors=lambda x: (
_variance_debiasing_factors(x, com=com, adjust=adjust,
ignore_na=ignore_na)))
@pytest.mark.slow
@pytest.mark.parametrize(
'min_periods', [0, 1, 2, 3, 4])
def test_expanding_consistency(self, min_periods):
# suppress warnings about empty slices, as we are deliberately testing
# with empty/0-length Series/DataFrames
with warnings.catch_warnings():
warnings.filterwarnings("ignore",
message=".*(empty slice|0 for slice).*",
category=RuntimeWarning)
# test consistency between different expanding_* moments
self._test_moments_consistency(
min_periods=min_periods,
count=lambda x: x.expanding().count(),
mean=lambda x: x.expanding(
min_periods=min_periods).mean(),
mock_mean=lambda x: x.expanding(
min_periods=min_periods).sum() / x.expanding().count(),
corr=lambda x, y: x.expanding(
min_periods=min_periods).corr(y),
var_unbiased=lambda x: x.expanding(
min_periods=min_periods).var(),
std_unbiased=lambda x: x.expanding(
min_periods=min_periods).std(),
cov_unbiased=lambda x, y: x.expanding(
min_periods=min_periods).cov(y),
var_biased=lambda x: x.expanding(
min_periods=min_periods).var(ddof=0),
std_biased=lambda x: x.expanding(
min_periods=min_periods).std(ddof=0),
cov_biased=lambda x, y: x.expanding(
min_periods=min_periods).cov(y, ddof=0),
var_debiasing_factors=lambda x: (
x.expanding().count() /
(x.expanding().count() - 1.)
.replace(0., np.nan)))
# test consistency between expanding_xyz() and either (a)
# expanding_apply of Series.xyz(), or (b) expanding_apply of
# np.nanxyz()
for (x, is_constant, no_nans) in self.data:
functions = self.base_functions
# GH 8269
if no_nans:
functions = self.base_functions + self.no_nan_functions
for (f, require_min_periods, name) in functions:
expanding_f = getattr(
x.expanding(min_periods=min_periods), name)
if (require_min_periods and
(min_periods is not None) and
(min_periods < require_min_periods)):
continue
if name == 'count':
expanding_f_result = expanding_f()
expanding_apply_f_result = x.expanding(
min_periods=0).apply(func=f, raw=True)
else:
if name in ['cov', 'corr']:
expanding_f_result = expanding_f(
pairwise=False)
else:
expanding_f_result = expanding_f()
expanding_apply_f_result = x.expanding(
min_periods=min_periods).apply(func=f, raw=True)
# GH 9422
if name in ['sum', 'prod']:
assert_equal(expanding_f_result,
expanding_apply_f_result)
@pytest.mark.slow
@pytest.mark.parametrize(
'window,min_periods,center', list(_rolling_consistency_cases()))
def test_rolling_consistency(self, window, min_periods, center):
# suppress warnings about empty slices, as we are deliberately testing
# with empty/0-length Series/DataFrames
with warnings.catch_warnings():
warnings.filterwarnings("ignore",
message=".*(empty slice|0 for slice).*",
category=RuntimeWarning)
# test consistency between different rolling_* moments
self._test_moments_consistency(
min_periods=min_periods,
count=lambda x: (
x.rolling(window=window, center=center)
.count()),
mean=lambda x: (
x.rolling(window=window, min_periods=min_periods,
center=center).mean()),
mock_mean=lambda x: (
x.rolling(window=window,
min_periods=min_periods,
center=center).sum()
.divide(x.rolling(window=window,
min_periods=min_periods,
center=center).count())),
corr=lambda x, y: (
x.rolling(window=window, min_periods=min_periods,
center=center).corr(y)),
var_unbiased=lambda x: (
x.rolling(window=window, min_periods=min_periods,
center=center).var()),
std_unbiased=lambda x: (
x.rolling(window=window, min_periods=min_periods,
center=center).std()),
cov_unbiased=lambda x, y: (
x.rolling(window=window, min_periods=min_periods,
center=center).cov(y)),
var_biased=lambda x: (
x.rolling(window=window, min_periods=min_periods,
center=center).var(ddof=0)),
std_biased=lambda x: (
x.rolling(window=window, min_periods=min_periods,
center=center).std(ddof=0)),
cov_biased=lambda x, y: (
x.rolling(window=window, min_periods=min_periods,
center=center).cov(y, ddof=0)),
var_debiasing_factors=lambda x: (
x.rolling(window=window, center=center).count()
.divide((x.rolling(window=window, center=center)
.count() - 1.)
.replace(0., np.nan))))
# test consistency between rolling_xyz() and either (a)
# rolling_apply of Series.xyz(), or (b) rolling_apply of
# np.nanxyz()
for (x, is_constant, no_nans) in self.data:
functions = self.base_functions
# GH 8269
if no_nans:
functions = self.base_functions + self.no_nan_functions
for (f, require_min_periods, name) in functions:
rolling_f = getattr(
x.rolling(window=window, center=center,
min_periods=min_periods), name)
if require_min_periods and (
min_periods is not None) and (
min_periods < require_min_periods):
continue
if name == 'count':
rolling_f_result = rolling_f()
rolling_apply_f_result = x.rolling(
window=window, min_periods=0,
center=center).apply(func=f, raw=True)
else:
if name in ['cov', 'corr']:
rolling_f_result = rolling_f(
pairwise=False)
else:
rolling_f_result = rolling_f()
rolling_apply_f_result = x.rolling(
window=window, min_periods=min_periods,
center=center).apply(func=f, raw=True)
# GH 9422
if name in ['sum', 'prod']:
assert_equal(rolling_f_result,
rolling_apply_f_result)
# binary moments
def test_rolling_cov(self):
A = self.series
B = A + randn(len(A))
result = A.rolling(window=50, min_periods=25).cov(B)
tm.assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1])
def test_rolling_cov_pairwise(self):
self._check_pairwise_moment('rolling', 'cov', window=10, min_periods=5)
def test_rolling_corr(self):
A = self.series
B = A + randn(len(A))
result = A.rolling(window=50, min_periods=25).corr(B)
tm.assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1])
# test for correct bias correction
a = tm.makeTimeSeries()
b = tm.makeTimeSeries()
a[:5] = np.nan
b[:10] = np.nan
result = a.rolling(window=len(a), min_periods=1).corr(b)
tm.assert_almost_equal(result[-1], a.corr(b))
def test_rolling_corr_pairwise(self):
self._check_pairwise_moment('rolling', 'corr', window=10,
min_periods=5)
@pytest.mark.parametrize('window', range(7))
def test_rolling_corr_with_zero_variance(self, window):
# GH 18430
s = pd.Series(np.zeros(20))
other = pd.Series(np.arange(20))
assert s.rolling(window=window).corr(other=other).isna().all()
def _check_pairwise_moment(self, dispatch, name, **kwargs):
def get_result(obj, obj2=None):
return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2)
result = get_result(self.frame)
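        # the pairwise result is indexed by (index, column); slice out the
        # moment between columns 1 and 5 and compare it with the same moment
        # computed directly on the two Series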
result = result.loc[(slice(None), 1), 5]
result.index = result.index.droplevel(1)
expected = get_result(self.frame[1], self.frame[5])
tm.assert_series_equal(result, expected, check_names=False)
def test_flex_binary_moment(self):
# GH3155
# don't blow the stack
pytest.raises(TypeError, rwindow._flex_binary_moment, 5, 6, None)
def test_corr_sanity(self):
# GH 3155
df = DataFrame(np.array(
[[0.87024726, 0.18505595], [0.64355431, 0.3091617],
[0.92372966, 0.50552513], [0.00203756, 0.04520709],
[0.84780328, 0.33394331], [0.78369152, 0.63919667]]))
res = df[0].rolling(5, center=True).corr(df[1])
assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res)
# and some fuzzing
for _ in range(10):
df = DataFrame(np.random.rand(30, 2))
res = df[0].rolling(5, center=True).corr(df[1])
try:
assert all(np.abs(np.nan_to_num(x)) <= 1 for x in res)
except AssertionError:
                print(res)
                raise
@pytest.mark.parametrize('method', ['corr', 'cov'])
def test_flex_binary_frame(self, method):
series = self.frame[1]
res = getattr(series.rolling(window=10), method)(self.frame)
res2 = getattr(self.frame.rolling(window=10), method)(series)
exp = self.frame.apply(lambda x: getattr(
series.rolling(window=10), method)(x))
tm.assert_frame_equal(res, exp)
tm.assert_frame_equal(res2, exp)
frame2 = self.frame.copy()
frame2.values[:] = np.random.randn(*frame2.shape)
res3 = getattr(self.frame.rolling(window=10), method)(frame2)
exp = DataFrame(dict((k, getattr(self.frame[k].rolling(
window=10), method)(frame2[k])) for k in self.frame))
tm.assert_frame_equal(res3, exp)
def test_ewmcov(self):
self._check_binary_ew('cov')
def test_ewmcov_pairwise(self):
self._check_pairwise_moment('ewm', 'cov', span=10, min_periods=5)
def test_ewmcorr(self):
self._check_binary_ew('corr')
def test_ewmcorr_pairwise(self):
self._check_pairwise_moment('ewm', 'corr', span=10, min_periods=5)
def _check_binary_ew(self, name):
def func(A, B, com, **kwargs):
return getattr(A.ewm(com, **kwargs), name)(B)
A = Series(randn(50), index=np.arange(50))
B = A[2:] + randn(48)
A[:10] = np.NaN
B[-10:] = np.NaN
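        # A is NaN for the first 10 positions and B for the last 10, so the
        # first valid (A, B) pair is at position 10; with min_periods=5 the
        # first non-NaN output is therefore position 14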
result = func(A, B, 20, min_periods=5)
assert np.isnan(result.values[:14]).all()
assert not np.isnan(result.values[14:]).any()
# GH 7898
for min_periods in (0, 1, 2):
result = func(A, B, 20, min_periods=min_periods)
# binary functions (ewmcov, ewmcorr) with bias=False require at
# least two values
assert np.isnan(result.values[:11]).all()
assert not np.isnan(result.values[11:]).any()
# check series of length 0
result = func(Series([]), Series([]), 50, min_periods=min_periods)
tm.assert_series_equal(result, Series([]))
# check series of length 1
result = func(
Series([1.]), Series([1.]), 50, min_periods=min_periods)
tm.assert_series_equal(result, Series([np.NaN]))
pytest.raises(Exception, func, A, randn(50), 20, min_periods=5)
def test_expanding_apply_args_kwargs(self, raw):
def mean_w_arg(x, const):
return np.mean(x) + const
df = DataFrame(np.random.rand(20, 3))
expected = df.expanding().apply(np.mean, raw=raw) + 20.
result = df.expanding().apply(mean_w_arg,
raw=raw,
args=(20, ))
tm.assert_frame_equal(result, expected)
result = df.expanding().apply(mean_w_arg,
raw=raw,
kwargs={'const': 20})
tm.assert_frame_equal(result, expected)
def test_expanding_corr(self):
A = self.series.dropna()
B = (A + randn(len(A)))[:-5]
result = A.expanding().corr(B)
rolling_result = A.rolling(window=len(A), min_periods=1).corr(B)
tm.assert_almost_equal(rolling_result, result)
def test_expanding_count(self):
result = self.series.expanding().count()
tm.assert_almost_equal(result, self.series.rolling(
window=len(self.series)).count())
def test_expanding_quantile(self):
result = self.series.expanding().quantile(0.5)
rolling_result = self.series.rolling(window=len(self.series),
min_periods=1).quantile(0.5)
tm.assert_almost_equal(result, rolling_result)
def test_expanding_cov(self):
A = self.series
B = (A + randn(len(A)))[:-5]
result = A.expanding().cov(B)
rolling_result = A.rolling(window=len(A), min_periods=1).cov(B)
tm.assert_almost_equal(rolling_result, result)
def test_expanding_cov_pairwise(self):
        result = self.frame.expanding().cov()
        rolling_result = self.frame.rolling(window=len(self.frame),
                                            min_periods=1).cov()
tm.assert_frame_equal(result, rolling_result)
def test_expanding_corr_pairwise(self):
result = self.frame.expanding().corr()
rolling_result = self.frame.rolling(window=len(self.frame),
min_periods=1).corr()
tm.assert_frame_equal(result, rolling_result)
def test_expanding_cov_diff_index(self):
# GH 7512
s1 = Series([1, 2, 3], index=[0, 1, 2])
s2 = Series([1, 3], index=[0, 2])
result = s1.expanding().cov(s2)
expected = Series([None, None, 2.0])
tm.assert_series_equal(result, expected)
s2a = Series([1, None, 3], index=[0, 1, 2])
result = s1.expanding().cov(s2a)
tm.assert_series_equal(result, expected)
s1 = Series([7, 8, 10], index=[0, 1, 3])
s2 = Series([7, 9, 10], index=[0, 2, 3])
result = s1.expanding().cov(s2)
expected = Series([None, None, None, 4.5])
tm.assert_series_equal(result, expected)
def test_expanding_corr_diff_index(self):
# GH 7512
s1 = Series([1, 2, 3], index=[0, 1, 2])
s2 = Series([1, 3], index=[0, 2])
result = s1.expanding().corr(s2)
expected = Series([None, None, 1.0])
tm.assert_series_equal(result, expected)
s2a = Series([1, None, 3], index=[0, 1, 2])
result = s1.expanding().corr(s2a)
tm.assert_series_equal(result, expected)
s1 = Series([7, 8, 10], index=[0, 1, 3])
s2 = Series([7, 9, 10], index=[0, 2, 3])
result = s1.expanding().corr(s2)
expected = Series([None, None, None, 1.])
tm.assert_series_equal(result, expected)
def test_rolling_cov_diff_length(self):
# GH 7512
s1 = Series([1, 2, 3], index=[0, 1, 2])
s2 = Series([1, 3], index=[0, 2])
result = s1.rolling(window=3, min_periods=2).cov(s2)
expected = Series([None, None, 2.0])
tm.assert_series_equal(result, expected)
s2a = Series([1, None, 3], index=[0, 1, 2])
result = s1.rolling(window=3, min_periods=2).cov(s2a)
tm.assert_series_equal(result, expected)
def test_rolling_corr_diff_length(self):
# GH 7512
s1 = Series([1, 2, 3], index=[0, 1, 2])
s2 = Series([1, 3], index=[0, 2])
result = s1.rolling(window=3, min_periods=2).corr(s2)
expected = Series([None, None, 1.0])
tm.assert_series_equal(result, expected)
s2a = Series([1, None, 3], index=[0, 1, 2])
result = s1.rolling(window=3, min_periods=2).corr(s2a)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
'f',
[
lambda x: (x.rolling(window=10, min_periods=5)
.cov(x, pairwise=False)),
lambda x: (x.rolling(window=10, min_periods=5)
.corr(x, pairwise=False)),
lambda x: x.rolling(window=10, min_periods=5).max(),
lambda x: x.rolling(window=10, min_periods=5).min(),
lambda x: x.rolling(window=10, min_periods=5).sum(),
lambda x: x.rolling(window=10, min_periods=5).mean(),
lambda x: x.rolling(window=10, min_periods=5).std(),
lambda x: x.rolling(window=10, min_periods=5).var(),
lambda x: x.rolling(window=10, min_periods=5).skew(),
lambda x: x.rolling(window=10, min_periods=5).kurt(),
lambda x: x.rolling(
window=10, min_periods=5).quantile(quantile=0.5),
lambda x: x.rolling(window=10, min_periods=5).median(),
lambda x: x.rolling(window=10, min_periods=5).apply(
sum, raw=False),
lambda x: x.rolling(window=10, min_periods=5).apply(
sum, raw=True),
lambda x: x.rolling(win_type='boxcar',
window=10, min_periods=5).mean()])
def test_rolling_functions_window_non_shrinkage(self, f):
# GH 7764
s = Series(range(4))
s_expected = Series(np.nan, index=s.index)
df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=['A', 'B'])
df_expected = DataFrame(np.nan, index=df.index, columns=df.columns)
try:
s_result = f(s)
tm.assert_series_equal(s_result, s_expected)
df_result = f(df)
tm.assert_frame_equal(df_result, df_expected)
except (ImportError):
# scipy needed for rolling_window
pytest.skip("scipy not available")
def test_rolling_functions_window_non_shrinkage_binary(self):
# corr/cov return a MI DataFrame
df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]],
columns=Index(['A', 'B'], name='foo'),
index=Index(range(4), name='bar'))
df_expected = DataFrame(
columns=Index(['A', 'B'], name='foo'),
index=pd.MultiIndex.from_product([df.index, df.columns],
names=['bar', 'foo']),
dtype='float64')
functions = [lambda x: (x.rolling(window=10, min_periods=5)
.cov(x, pairwise=True)),
lambda x: (x.rolling(window=10, min_periods=5)
.corr(x, pairwise=True))]
for f in functions:
df_result = f(df)
tm.assert_frame_equal(df_result, df_expected)
def test_moment_functions_zero_length(self):
# GH 8056
s = Series()
s_expected = s
df1 = DataFrame()
df1_expected = df1
df2 = DataFrame(columns=['a'])
df2['a'] = df2['a'].astype('float64')
df2_expected = df2
functions = [lambda x: x.expanding().count(),
lambda x: x.expanding(min_periods=5).cov(
x, pairwise=False),
lambda x: x.expanding(min_periods=5).corr(
x, pairwise=False),
lambda x: x.expanding(min_periods=5).max(),
lambda x: x.expanding(min_periods=5).min(),
lambda x: x.expanding(min_periods=5).sum(),
lambda x: x.expanding(min_periods=5).mean(),
lambda x: x.expanding(min_periods=5).std(),
lambda x: x.expanding(min_periods=5).var(),
lambda x: x.expanding(min_periods=5).skew(),
lambda x: x.expanding(min_periods=5).kurt(),
lambda x: x.expanding(min_periods=5).quantile(0.5),
lambda x: x.expanding(min_periods=5).median(),
lambda x: x.expanding(min_periods=5).apply(
sum, raw=False),
lambda x: x.expanding(min_periods=5).apply(
sum, raw=True),
lambda x: x.rolling(window=10).count(),
lambda x: x.rolling(window=10, min_periods=5).cov(
x, pairwise=False),
lambda x: x.rolling(window=10, min_periods=5).corr(
x, pairwise=False),
lambda x: x.rolling(window=10, min_periods=5).max(),
lambda x: x.rolling(window=10, min_periods=5).min(),
lambda x: x.rolling(window=10, min_periods=5).sum(),
lambda x: x.rolling(window=10, min_periods=5).mean(),
lambda x: x.rolling(window=10, min_periods=5).std(),
lambda x: x.rolling(window=10, min_periods=5).var(),
lambda x: x.rolling(window=10, min_periods=5).skew(),
lambda x: x.rolling(window=10, min_periods=5).kurt(),
lambda x: x.rolling(
window=10, min_periods=5).quantile(0.5),
lambda x: x.rolling(window=10, min_periods=5).median(),
lambda x: x.rolling(window=10, min_periods=5).apply(
sum, raw=False),
lambda x: x.rolling(window=10, min_periods=5).apply(
sum, raw=True),
lambda x: x.rolling(win_type='boxcar',
window=10, min_periods=5).mean(),
]
for f in functions:
try:
s_result = f(s)
tm.assert_series_equal(s_result, s_expected)
df1_result = f(df1)
tm.assert_frame_equal(df1_result, df1_expected)
df2_result = f(df2)
tm.assert_frame_equal(df2_result, df2_expected)
except (ImportError):
# scipy needed for rolling_window
continue
def test_moment_functions_zero_length_pairwise(self):
df1 = DataFrame()
df2 = DataFrame(columns=Index(['a'], name='foo'),
index=Index([], name='bar'))
df2['a'] = df2['a'].astype('float64')
df1_expected = DataFrame(
index=pd.MultiIndex.from_product([df1.index, df1.columns]),
columns=Index([]))
df2_expected = DataFrame(
index=pd.MultiIndex.from_product([df2.index, df2.columns],
names=['bar', 'foo']),
columns=Index(['a'], name='foo'),
dtype='float64')
functions = [lambda x: (x.expanding(min_periods=5)
.cov(x, pairwise=True)),
lambda x: (x.expanding(min_periods=5)
.corr(x, pairwise=True)),
lambda x: (x.rolling(window=10, min_periods=5)
.cov(x, pairwise=True)),
lambda x: (x.rolling(window=10, min_periods=5)
.corr(x, pairwise=True)),
]
for f in functions:
df1_result = f(df1)
tm.assert_frame_equal(df1_result, df1_expected)
df2_result = f(df2)
tm.assert_frame_equal(df2_result, df2_expected)
def test_expanding_cov_pairwise_diff_length(self):
# GH 7512
df1 = DataFrame([[1, 5], [3, 2], [3, 9]],
columns=Index(['A', 'B'], name='foo'))
df1a = DataFrame([[1, 5], [3, 9]],
index=[0, 2],
columns=Index(['A', 'B'], name='foo'))
df2 = DataFrame([[5, 6], [None, None], [2, 1]],
columns=Index(['X', 'Y'], name='foo'))
df2a = DataFrame([[5, 6], [2, 1]],
index=[0, 2],
columns=Index(['X', 'Y'], name='foo'))
# TODO: xref gh-15826
# .loc is not preserving the names
        result1 = df1.expanding().cov(df2, pairwise=True).loc[2]
result2 = df1.expanding().cov(df2a, pairwise=True).loc[2]
result3 = df1a.expanding().cov(df2, pairwise=True).loc[2]
result4 = df1a.expanding().cov(df2a, pairwise=True).loc[2]
expected = DataFrame([[-3.0, -6.0], [-5.0, -10.0]],
columns=Index(['A', 'B'], name='foo'),
index=Index(['X', 'Y'], name='foo'))
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)
tm.assert_frame_equal(result3, expected)
tm.assert_frame_equal(result4, expected)
def test_expanding_corr_pairwise_diff_length(self):
# GH 7512
df1 = DataFrame([[1, 2], [3, 2], [3, 4]],
columns=['A', 'B'],
index=Index(range(3), name='bar'))
df1a = DataFrame([[1, 2], [3, 4]],
index=Index([0, 2], name='bar'),
columns=['A', 'B'])
df2 = DataFrame([[5, 6], [None, None], [2, 1]],
columns=['X', 'Y'],
index=Index(range(3), name='bar'))
df2a = DataFrame([[5, 6], [2, 1]],
index=Index([0, 2], name='bar'),
columns=['X', 'Y'])
result1 = df1.expanding().corr(df2, pairwise=True).loc[2]
result2 = df1.expanding().corr(df2a, pairwise=True).loc[2]
result3 = df1a.expanding().corr(df2, pairwise=True).loc[2]
result4 = df1a.expanding().corr(df2a, pairwise=True).loc[2]
expected = DataFrame([[-1.0, -1.0], [-1.0, -1.0]],
columns=['A', 'B'],
index=Index(['X', 'Y']))
tm.assert_frame_equal(result1, expected)
tm.assert_frame_equal(result2, expected)
tm.assert_frame_equal(result3, expected)
tm.assert_frame_equal(result4, expected)
def test_rolling_skew_edge_cases(self):
all_nan = Series([np.NaN] * 5)
# yields all NaN (0 variance)
d = Series([1] * 5)
x = d.rolling(window=5).skew()
tm.assert_series_equal(all_nan, x)
# yields all NaN (window too small)
d = Series(np.random.randn(5))
x = d.rolling(window=2).skew()
tm.assert_series_equal(all_nan, x)
# yields [NaN, NaN, NaN, 0.177994, 1.548824]
d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401
])
expected = Series([np.NaN, np.NaN, np.NaN, 0.177994, 1.548824])
x = d.rolling(window=4).skew()
tm.assert_series_equal(expected, x)
def test_rolling_kurt_edge_cases(self):
all_nan = Series([np.NaN] * 5)
# yields all NaN (0 variance)
d = Series([1] * 5)
x = d.rolling(window=5).kurt()
tm.assert_series_equal(all_nan, x)
# yields all NaN (window too small)
d = Series(np.random.randn(5))
x = d.rolling(window=3).kurt()
tm.assert_series_equal(all_nan, x)
# yields [NaN, NaN, NaN, 1.224307, 2.671499]
d = Series([-1.50837035, -0.1297039, 0.19501095, 1.73508164, 0.41941401
])
expected = Series([np.NaN, np.NaN, np.NaN, 1.224307, 2.671499])
x = d.rolling(window=4).kurt()
tm.assert_series_equal(expected, x)
def test_rolling_skew_eq_value_fperr(self):
        # GH 18804: rolling skew of a series of all-equal values should
        # return NaN despite floating-point error
a = Series([1.1] * 15).rolling(window=10).skew()
assert np.isnan(a).all()
def test_rolling_kurt_eq_value_fperr(self):
        # GH 18804: rolling kurt of a series of all-equal values should
        # return NaN despite floating-point error
a = Series([1.1] * 15).rolling(window=10).kurt()
assert np.isnan(a).all()
@pytest.mark.parametrize('func,static_comp', [('sum', np.sum),
('mean', np.mean),
('max', np.max),
('min', np.min)],
ids=['sum', 'mean', 'max', 'min'])
def test_expanding_func(self, func, static_comp):
def expanding_func(x, min_periods=1, center=False, axis=0):
exp = x.expanding(min_periods=min_periods,
center=center, axis=axis)
return getattr(exp, func)()
self._check_expanding(expanding_func, static_comp, preserve_nan=False)
def test_expanding_apply(self, raw):
def expanding_mean(x, min_periods=1):
exp = x.expanding(min_periods=min_periods)
result = exp.apply(lambda x: x.mean(), raw=raw)
return result
# TODO(jreback), needed to add preserve_nan=False
# here to make this pass
self._check_expanding(expanding_mean, np.mean, preserve_nan=False)
ser = Series([])
tm.assert_series_equal(ser, ser.expanding().apply(
lambda x: x.mean(), raw=raw))
# GH 8080
s = Series([None, None, None])
result = s.expanding(min_periods=0).apply(lambda x: len(x), raw=raw)
expected = Series([1., 2., 3.])
tm.assert_series_equal(result, expected)
def _check_expanding(self, func, static_comp, has_min_periods=True,
has_time_rule=True, preserve_nan=True):
series_result = func(self.series)
assert isinstance(series_result, Series)
frame_result = func(self.frame)
assert isinstance(frame_result, DataFrame)
result = func(self.series)
tm.assert_almost_equal(result[10], static_comp(self.series[:11]))
if preserve_nan:
assert result.iloc[self._nan_locs].isna().all()
ser = Series(randn(50))
if has_min_periods:
result = func(ser, min_periods=30)
assert result[:29].isna().all()
tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50]))
# min_periods is working correctly
result = func(ser, min_periods=15)
assert isna(result.iloc[13])
assert notna(result.iloc[14])
ser2 = Series(randn(20))
result = func(ser2, min_periods=5)
assert isna(result[3])
assert notna(result[4])
# min_periods=0
result0 = func(ser, min_periods=0)
result1 = func(ser, min_periods=1)
tm.assert_almost_equal(result0, result1)
else:
result = func(ser)
tm.assert_almost_equal(result.iloc[-1], static_comp(ser[:50]))
def test_rolling_max_gh6297(self):
"""Replicate result expected in GH #6297"""
indices = [datetime(1975, 1, i) for i in range(1, 6)]
# So that we can have 2 datapoints on one of the days
indices.append(datetime(1975, 1, 3, 6, 0))
series = Series(range(1, 7), index=indices)
# Use floats instead of ints as values
series = series.map(lambda x: float(x))
# Sort chronologically
series = series.sort_index()
expected = Series([1.0, 2.0, 6.0, 4.0, 5.0],
index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
x = series.resample('D').max().rolling(window=1).max()
tm.assert_series_equal(expected, x)
def test_rolling_max_resample(self):
indices = [datetime(1975, 1, i) for i in range(1, 6)]
# So that we can have 3 datapoints on last day (4, 10, and 20)
indices.append(datetime(1975, 1, 5, 1))
indices.append(datetime(1975, 1, 5, 2))
series = Series(list(range(0, 5)) + [10, 20], index=indices)
# Use floats instead of ints as values
series = series.map(lambda x: float(x))
# Sort chronologically
series = series.sort_index()
# Default how should be max
expected = Series([0.0, 1.0, 2.0, 3.0, 20.0],
index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
x = series.resample('D').max().rolling(window=1).max()
tm.assert_series_equal(expected, x)
# Now specify median (10.0)
expected = Series([0.0, 1.0, 2.0, 3.0, 10.0],
index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
x = series.resample('D').median().rolling(window=1).max()
tm.assert_series_equal(expected, x)
# Now specify mean (4+10+20)/3
v = (4.0 + 10.0 + 20.0) / 3.0
expected = Series([0.0, 1.0, 2.0, 3.0, v],
index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
x = series.resample('D').mean().rolling(window=1).max()
tm.assert_series_equal(expected, x)
def test_rolling_min_resample(self):
indices = [datetime(1975, 1, i) for i in range(1, 6)]
# So that we can have 3 datapoints on last day (4, 10, and 20)
indices.append(datetime(1975, 1, 5, 1))
indices.append(datetime(1975, 1, 5, 2))
series = Series(list(range(0, 5)) + [10, 20], index=indices)
# Use floats instead of ints as values
series = series.map(lambda x: float(x))
# Sort chronologically
series = series.sort_index()
# Default how should be min
expected = Series([0.0, 1.0, 2.0, 3.0, 4.0],
index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
r = series.resample('D').min().rolling(window=1)
tm.assert_series_equal(expected, r.min())
def test_rolling_median_resample(self):
indices = [datetime(1975, 1, i) for i in range(1, 6)]
# So that we can have 3 datapoints on last day (4, 10, and 20)
indices.append(datetime(1975, 1, 5, 1))
indices.append(datetime(1975, 1, 5, 2))
series = Series(list(range(0, 5)) + [10, 20], index=indices)
# Use floats instead of ints as values
series = series.map(lambda x: float(x))
# Sort chronologically
series = series.sort_index()
# Default how should be median
expected = Series([0.0, 1.0, 2.0, 3.0, 10],
index=[datetime(1975, 1, i, 0) for i in range(1, 6)])
x = series.resample('D').median().rolling(window=1).median()
tm.assert_series_equal(expected, x)
def test_rolling_median_memory_error(self):
# GH11722
n = 20000
Series(np.random.randn(n)).rolling(window=2, center=False).median()
Series(np.random.randn(n)).rolling(window=2, center=False).median()
def test_rolling_min_max_numeric_types(self):
# GH12373
types_test = [np.dtype("f{}".format(width)) for width in [4, 8]]
types_test.extend([np.dtype("{}{}".format(sign, width))
for width in [1, 2, 4, 8] for sign in "ui"])
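        # float32/float64 plus signed and unsigned 8/16/32/64-bit integers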
for data_type in types_test:
# Just testing that these don't throw exceptions and that
# the return type is float64. Other tests will cover quantitative
# correctness
result = (DataFrame(np.arange(20, dtype=data_type))
.rolling(window=5).max())
assert result.dtypes[0] == np.dtype("f8")
result = (DataFrame(np.arange(20, dtype=data_type))
.rolling(window=5).min())
assert result.dtypes[0] == np.dtype("f8")


class TestGrouperGrouping(object):
def setup_method(self, method):
self.series = Series(np.arange(10))
self.frame = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8,
'B': np.arange(40)})
def test_mutated(self):
def f():
self.frame.groupby('A', foo=1)
pytest.raises(TypeError, f)
g = self.frame.groupby('A')
assert not g.mutated
g = self.frame.groupby('A', mutated=True)
assert g.mutated
def test_getitem(self):
g = self.frame.groupby('A')
g_mutated = self.frame.groupby('A', mutated=True)
expected = g_mutated.B.apply(lambda x: x.rolling(2).mean())
result = g.rolling(2).mean().B
tm.assert_series_equal(result, expected)
result = g.rolling(2).B.mean()
tm.assert_series_equal(result, expected)
result = g.B.rolling(2).mean()
tm.assert_series_equal(result, expected)
result = self.frame.B.groupby(self.frame.A).rolling(2).mean()
tm.assert_series_equal(result, expected)
def test_getitem_multiple(self):
# GH 13174
g = self.frame.groupby('A')
r = g.rolling(2)
g_mutated = self.frame.groupby('A', mutated=True)
expected = g_mutated.B.apply(lambda x: x.rolling(2).count())
result = r.B.count()
tm.assert_series_equal(result, expected)
result = r.B.count()
tm.assert_series_equal(result, expected)
def test_rolling(self):
g = self.frame.groupby('A')
r = g.rolling(window=4)
for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']:
result = getattr(r, f)()
expected = g.apply(lambda x: getattr(x.rolling(4), f)())
tm.assert_frame_equal(result, expected)
for f in ['std', 'var']:
result = getattr(r, f)(ddof=1)
expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1))
tm.assert_frame_equal(result, expected)
result = r.quantile(0.5)
expected = g.apply(lambda x: x.rolling(4).quantile(0.5))
tm.assert_frame_equal(result, expected)
def test_rolling_corr_cov(self):
g = self.frame.groupby('A')
r = g.rolling(window=4)
for f in ['corr', 'cov']:
result = getattr(r, f)(self.frame)
def func(x):
return getattr(x.rolling(4), f)(self.frame)
expected = g.apply(func)
tm.assert_frame_equal(result, expected)
result = getattr(r.B, f)(pairwise=True)
def func(x):
return getattr(x.B.rolling(4), f)(pairwise=True)
expected = g.apply(func)
tm.assert_series_equal(result, expected)
def test_rolling_apply(self, raw):
g = self.frame.groupby('A')
r = g.rolling(window=4)
# reduction
result = r.apply(lambda x: x.sum(), raw=raw)
expected = g.apply(
lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw))
tm.assert_frame_equal(result, expected)
def test_rolling_apply_mutability(self):
# GH 14013
df = pd.DataFrame({'A': ['foo'] * 3 + ['bar'] * 3, 'B': [1] * 6})
g = df.groupby('A')
mi = pd.MultiIndex.from_tuples([('bar', 3), ('bar', 4), ('bar', 5),
('foo', 0), ('foo', 1), ('foo', 2)])
mi.names = ['A', None]
# Grouped column should not be a part of the output
expected = pd.DataFrame([np.nan, 2., 2.] * 2, columns=['B'], index=mi)
result = g.rolling(window=2).sum()
tm.assert_frame_equal(result, expected)
# Call an arbitrary function on the groupby
g.sum()
# Make sure nothing has been mutated
result = g.rolling(window=2).sum()
tm.assert_frame_equal(result, expected)
def test_expanding(self):
g = self.frame.groupby('A')
r = g.expanding()
for f in ['sum', 'mean', 'min', 'max', 'count', 'kurt', 'skew']:
result = getattr(r, f)()
expected = g.apply(lambda x: getattr(x.expanding(), f)())
tm.assert_frame_equal(result, expected)
for f in ['std', 'var']:
result = getattr(r, f)(ddof=0)
expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0))
tm.assert_frame_equal(result, expected)
result = r.quantile(0.5)
expected = g.apply(lambda x: x.expanding().quantile(0.5))
tm.assert_frame_equal(result, expected)
def test_expanding_corr_cov(self):
g = self.frame.groupby('A')
r = g.expanding()
for f in ['corr', 'cov']:
result = getattr(r, f)(self.frame)
def func(x):
return getattr(x.expanding(), f)(self.frame)
expected = g.apply(func)
tm.assert_frame_equal(result, expected)
result = getattr(r.B, f)(pairwise=True)
def func(x):
return getattr(x.B.expanding(), f)(pairwise=True)
expected = g.apply(func)
tm.assert_series_equal(result, expected)
def test_expanding_apply(self, raw):
g = self.frame.groupby('A')
r = g.expanding()
# reduction
result = r.apply(lambda x: x.sum(), raw=raw)
expected = g.apply(
lambda x: x.expanding().apply(lambda y: y.sum(), raw=raw))
tm.assert_frame_equal(result, expected)


class TestRollingTS(object):
# rolling time-series friendly
# xref GH13327
def setup_method(self, method):
self.regular = DataFrame({'A': pd.date_range('20130101',
periods=5,
freq='s'),
'B': range(5)}).set_index('A')
self.ragged = DataFrame({'B': range(5)})
self.ragged.index = [Timestamp('20130101 09:00:00'),
Timestamp('20130101 09:00:02'),
Timestamp('20130101 09:00:03'),
Timestamp('20130101 09:00:05'),
Timestamp('20130101 09:00:06')]
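        # the ragged frame has points 0, 2, 3, 5 and 6 seconds past
        # 09:00:00; the ragged_* tests derive their expected values from
        # which of these points fall inside each time-based window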
def test_doc_string(self):
df = DataFrame({'B': [0, 1, 2, np.nan, 4]},
index=[Timestamp('20130101 09:00:00'),
Timestamp('20130101 09:00:02'),
Timestamp('20130101 09:00:03'),
Timestamp('20130101 09:00:05'),
Timestamp('20130101 09:00:06')])
df
df.rolling('2s').sum()
def test_valid(self):
df = self.regular
# not a valid freq
with pytest.raises(ValueError):
df.rolling(window='foobar')
# not a datetimelike index
with pytest.raises(ValueError):
df.reset_index().rolling(window='foobar')
# non-fixed freqs
for freq in ['2MS', pd.offsets.MonthBegin(2)]:
with pytest.raises(ValueError):
df.rolling(window=freq)
for freq in ['1D', pd.offsets.Day(2), '2ms']:
df.rolling(window=freq)
# non-integer min_periods
for minp in [1.0, 'foo', np.array([1, 2, 3])]:
with pytest.raises(ValueError):
df.rolling(window='1D', min_periods=minp)
# center is not implemented
with pytest.raises(NotImplementedError):
df.rolling(window='1D', center=True)
def test_on(self):
df = self.regular
# not a valid column
with pytest.raises(ValueError):
df.rolling(window='2s', on='foobar')
# column is valid
df = df.copy()
df['C'] = pd.date_range('20130101', periods=len(df))
df.rolling(window='2d', on='C').sum()
# invalid columns
with pytest.raises(ValueError):
df.rolling(window='2d', on='B')
# ok even though on non-selected
df.rolling(window='2d', on='C').B.sum()
def test_monotonic_on(self):
# on/index must be monotonic
df = DataFrame({'A': pd.date_range('20130101',
periods=5,
freq='s'),
'B': range(5)})
assert df.A.is_monotonic
df.rolling('2s', on='A').sum()
df = df.set_index('A')
assert df.index.is_monotonic
df.rolling('2s').sum()
# non-monotonic
df.index = reversed(df.index.tolist())
assert not df.index.is_monotonic
with pytest.raises(ValueError):
df.rolling('2s').sum()
df = df.reset_index()
with pytest.raises(ValueError):
df.rolling('2s', on='A').sum()
def test_frame_on(self):
df = DataFrame({'B': range(5),
'C': pd.date_range('20130101 09:00:00',
periods=5,
freq='3s')})
df['A'] = [Timestamp('20130101 09:00:00'),
Timestamp('20130101 09:00:02'),
Timestamp('20130101 09:00:03'),
Timestamp('20130101 09:00:05'),
Timestamp('20130101 09:00:06')]
        # build the expected by setting 'A' as the index instead of using
        # 'on'
expected = (df.set_index('A')
.rolling('2s')
.B
.sum()
.reset_index(drop=True)
)
result = (df.rolling('2s', on='A')
.B
.sum()
)
tm.assert_series_equal(result, expected)
# test as a frame
# we should be ignoring the 'on' as an aggregation column
# note that the expected is setting, computing, and resetting
# so the columns need to be switched compared
# to the actual result where they are ordered as in the
# original
expected = (df.set_index('A')
.rolling('2s')[['B']]
.sum()
.reset_index()[['B', 'A']]
)
result = (df.rolling('2s', on='A')[['B']]
.sum()
)
tm.assert_frame_equal(result, expected)
def test_frame_on2(self):
# using multiple aggregation columns
df = DataFrame({'A': [0, 1, 2, 3, 4],
'B': [0, 1, 2, np.nan, 4],
'C': Index([Timestamp('20130101 09:00:00'),
Timestamp('20130101 09:00:02'),
Timestamp('20130101 09:00:03'),
Timestamp('20130101 09:00:05'),
Timestamp('20130101 09:00:06')])},
columns=['A', 'C', 'B'])
expected1 = DataFrame({'A': [0., 1, 3, 3, 7],
'B': [0, 1, 3, np.nan, 4],
'C': df['C']},
columns=['A', 'C', 'B'])
result = df.rolling('2s', on='C').sum()
expected = expected1
tm.assert_frame_equal(result, expected)
expected = Series([0, 1, 3, np.nan, 4], name='B')
result = df.rolling('2s', on='C').B.sum()
tm.assert_series_equal(result, expected)
expected = expected1[['A', 'B', 'C']]
result = df.rolling('2s', on='C')[['A', 'B', 'C']].sum()
tm.assert_frame_equal(result, expected)
def test_basic_regular(self):
df = self.regular.copy()
df.index = pd.date_range('20130101', periods=5, freq='D')
expected = df.rolling(window=1, min_periods=1).sum()
result = df.rolling(window='1D').sum()
tm.assert_frame_equal(result, expected)
df.index = pd.date_range('20130101', periods=5, freq='2D')
expected = df.rolling(window=1, min_periods=1).sum()
result = df.rolling(window='2D', min_periods=1).sum()
tm.assert_frame_equal(result, expected)
expected = df.rolling(window=1, min_periods=1).sum()
result = df.rolling(window='2D', min_periods=1).sum()
tm.assert_frame_equal(result, expected)
expected = df.rolling(window=1).sum()
result = df.rolling(window='2D').sum()
tm.assert_frame_equal(result, expected)
def test_min_periods(self):
# compare for min_periods
df = self.regular
        # these window specs are slightly different but give the same result
expected = df.rolling(2, min_periods=1).sum()
result = df.rolling('2s').sum()
tm.assert_frame_equal(result, expected)
expected = df.rolling(2, min_periods=1).sum()
result = df.rolling('2s', min_periods=1).sum()
tm.assert_frame_equal(result, expected)
def test_closed(self):
# xref GH13965
df = DataFrame({'A': [1] * 5},
index=[Timestamp('20130101 09:00:01'),
Timestamp('20130101 09:00:02'),
Timestamp('20130101 09:00:03'),
Timestamp('20130101 09:00:04'),
Timestamp('20130101 09:00:06')])
# closed must be 'right', 'left', 'both', 'neither'
with pytest.raises(ValueError):
self.regular.rolling(window='2s', closed="blabla")
expected = df.copy()
expected["A"] = [1.0, 2, 2, 2, 1]
result = df.rolling('2s', closed='right').sum()
tm.assert_frame_equal(result, expected)
# default should be 'right'
result = df.rolling('2s').sum()
tm.assert_frame_equal(result, expected)
expected = df.copy()
expected["A"] = [1.0, 2, 3, 3, 2]
result = df.rolling('2s', closed='both').sum()
tm.assert_frame_equal(result, expected)
expected = df.copy()
expected["A"] = [np.nan, 1.0, 2, 2, 1]
result = df.rolling('2s', closed='left').sum()
tm.assert_frame_equal(result, expected)
expected = df.copy()
expected["A"] = [np.nan, 1.0, 1, 1, np.nan]
result = df.rolling('2s', closed='neither').sum()
tm.assert_frame_equal(result, expected)
def test_ragged_sum(self):
df = self.ragged
result = df.rolling(window='1s', min_periods=1).sum()
expected = df.copy()
expected['B'] = [0.0, 1, 2, 3, 4]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='2s', min_periods=1).sum()
expected = df.copy()
expected['B'] = [0.0, 1, 3, 3, 7]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='2s', min_periods=2).sum()
expected = df.copy()
expected['B'] = [np.nan, np.nan, 3, np.nan, 7]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='3s', min_periods=1).sum()
expected = df.copy()
expected['B'] = [0.0, 1, 3, 5, 7]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='3s').sum()
expected = df.copy()
expected['B'] = [0.0, 1, 3, 5, 7]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='4s', min_periods=1).sum()
expected = df.copy()
expected['B'] = [0.0, 1, 3, 6, 9]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='4s', min_periods=3).sum()
expected = df.copy()
expected['B'] = [np.nan, np.nan, 3, 6, 9]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='5s', min_periods=1).sum()
expected = df.copy()
expected['B'] = [0.0, 1, 3, 6, 10]
tm.assert_frame_equal(result, expected)
def test_ragged_mean(self):
df = self.ragged
result = df.rolling(window='1s', min_periods=1).mean()
expected = df.copy()
expected['B'] = [0.0, 1, 2, 3, 4]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='2s', min_periods=1).mean()
expected = df.copy()
expected['B'] = [0.0, 1, 1.5, 3.0, 3.5]
tm.assert_frame_equal(result, expected)
def test_ragged_median(self):
df = self.ragged
result = df.rolling(window='1s', min_periods=1).median()
expected = df.copy()
expected['B'] = [0.0, 1, 2, 3, 4]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='2s', min_periods=1).median()
expected = df.copy()
expected['B'] = [0.0, 1, 1.5, 3.0, 3.5]
tm.assert_frame_equal(result, expected)
def test_ragged_quantile(self):
df = self.ragged
result = df.rolling(window='1s', min_periods=1).quantile(0.5)
expected = df.copy()
expected['B'] = [0.0, 1, 2, 3, 4]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='2s', min_periods=1).quantile(0.5)
expected = df.copy()
expected['B'] = [0.0, 1, 1.5, 3.0, 3.5]
tm.assert_frame_equal(result, expected)
def test_ragged_std(self):
df = self.ragged
result = df.rolling(window='1s', min_periods=1).std(ddof=0)
expected = df.copy()
expected['B'] = [0.0] * 5
tm.assert_frame_equal(result, expected)
result = df.rolling(window='1s', min_periods=1).std(ddof=1)
expected = df.copy()
expected['B'] = [np.nan] * 5
tm.assert_frame_equal(result, expected)
result = df.rolling(window='3s', min_periods=1).std(ddof=0)
expected = df.copy()
expected['B'] = [0.0] + [0.5] * 4
tm.assert_frame_equal(result, expected)
result = df.rolling(window='5s', min_periods=1).std(ddof=1)
expected = df.copy()
expected['B'] = [np.nan, 0.707107, 1.0, 1.0, 1.290994]
tm.assert_frame_equal(result, expected)
def test_ragged_var(self):
df = self.ragged
result = df.rolling(window='1s', min_periods=1).var(ddof=0)
expected = df.copy()
expected['B'] = [0.0] * 5
tm.assert_frame_equal(result, expected)
result = df.rolling(window='1s', min_periods=1).var(ddof=1)
expected = df.copy()
expected['B'] = [np.nan] * 5
tm.assert_frame_equal(result, expected)
result = df.rolling(window='3s', min_periods=1).var(ddof=0)
expected = df.copy()
expected['B'] = [0.0] + [0.25] * 4
tm.assert_frame_equal(result, expected)
result = df.rolling(window='5s', min_periods=1).var(ddof=1)
expected = df.copy()
expected['B'] = [np.nan, 0.5, 1.0, 1.0, 1 + 2 / 3.]
tm.assert_frame_equal(result, expected)
def test_ragged_skew(self):
df = self.ragged
result = df.rolling(window='3s', min_periods=1).skew()
expected = df.copy()
expected['B'] = [np.nan] * 5
tm.assert_frame_equal(result, expected)
result = df.rolling(window='5s', min_periods=1).skew()
expected = df.copy()
expected['B'] = [np.nan] * 2 + [0.0, 0.0, 0.0]
tm.assert_frame_equal(result, expected)
def test_ragged_kurt(self):
df = self.ragged
result = df.rolling(window='3s', min_periods=1).kurt()
expected = df.copy()
expected['B'] = [np.nan] * 5
tm.assert_frame_equal(result, expected)
result = df.rolling(window='5s', min_periods=1).kurt()
expected = df.copy()
expected['B'] = [np.nan] * 4 + [-1.2]
tm.assert_frame_equal(result, expected)
def test_ragged_count(self):
df = self.ragged
result = df.rolling(window='1s', min_periods=1).count()
expected = df.copy()
expected['B'] = [1.0, 1, 1, 1, 1]
tm.assert_frame_equal(result, expected)
df = self.ragged
result = df.rolling(window='1s').count()
tm.assert_frame_equal(result, expected)
result = df.rolling(window='2s', min_periods=1).count()
expected = df.copy()
expected['B'] = [1.0, 1, 2, 1, 2]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='2s', min_periods=2).count()
expected = df.copy()
expected['B'] = [np.nan, np.nan, 2, np.nan, 2]
tm.assert_frame_equal(result, expected)
def test_regular_min(self):
df = DataFrame({'A': pd.date_range('20130101',
periods=5,
freq='s'),
'B': [0.0, 1, 2, 3, 4]}).set_index('A')
result = df.rolling('1s').min()
expected = df.copy()
expected['B'] = [0.0, 1, 2, 3, 4]
tm.assert_frame_equal(result, expected)
df = DataFrame({'A': pd.date_range('20130101',
periods=5,
freq='s'),
'B': [5, 4, 3, 4, 5]}).set_index('A')
result = df.rolling('2s').min()
expected = df.copy()
expected['B'] = [5.0, 4, 3, 3, 4]
tm.assert_frame_equal(result, expected)
result = df.rolling('5s').min()
expected = df.copy()
expected['B'] = [5.0, 4, 3, 3, 3]
tm.assert_frame_equal(result, expected)
def test_ragged_min(self):
df = self.ragged
result = df.rolling(window='1s', min_periods=1).min()
expected = df.copy()
expected['B'] = [0.0, 1, 2, 3, 4]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='2s', min_periods=1).min()
expected = df.copy()
expected['B'] = [0.0, 1, 1, 3, 3]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='5s', min_periods=1).min()
expected = df.copy()
expected['B'] = [0.0, 0, 0, 1, 1]
tm.assert_frame_equal(result, expected)
def test_perf_min(self):
N = 10000
dfp = DataFrame({'B': np.random.randn(N)},
index=pd.date_range('20130101',
periods=N,
freq='s'))
expected = dfp.rolling(2, min_periods=1).min()
result = dfp.rolling('2s').min()
assert ((result - expected) < 0.01).all().bool()
expected = dfp.rolling(200, min_periods=1).min()
result = dfp.rolling('200s').min()
assert ((result - expected) < 0.01).all().bool()
def test_ragged_max(self):
df = self.ragged
result = df.rolling(window='1s', min_periods=1).max()
expected = df.copy()
expected['B'] = [0.0, 1, 2, 3, 4]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='2s', min_periods=1).max()
expected = df.copy()
expected['B'] = [0.0, 1, 2, 3, 4]
tm.assert_frame_equal(result, expected)
result = df.rolling(window='5s', min_periods=1).max()
expected = df.copy()
expected['B'] = [0.0, 1, 2, 3, 4]
tm.assert_frame_equal(result, expected)
def test_ragged_apply(self, raw):
df = self.ragged
f = lambda x: 1
result = df.rolling(window='1s', min_periods=1).apply(f, raw=raw)
expected = df.copy()
expected['B'] = 1.
tm.assert_frame_equal(result, expected)
result = df.rolling(window='2s', min_periods=1).apply(f, raw=raw)
expected = df.copy()
expected['B'] = 1.
tm.assert_frame_equal(result, expected)
result = df.rolling(window='5s', min_periods=1).apply(f, raw=raw)
expected = df.copy()
expected['B'] = 1.
tm.assert_frame_equal(result, expected)
def test_all(self):
# simple comparison of integer vs time-based windowing
df = self.regular * 2
er = df.rolling(window=1)
r = df.rolling(window='1s')
for f in ['sum', 'mean', 'count', 'median', 'std',
'var', 'kurt', 'skew', 'min', 'max']:
result = getattr(r, f)()
expected = getattr(er, f)()
tm.assert_frame_equal(result, expected)
result = r.quantile(0.5)
expected = er.quantile(0.5)
tm.assert_frame_equal(result, expected)
def test_all_apply(self, raw):
df = self.regular * 2
er = df.rolling(window=1)
r = df.rolling(window='1s')
result = r.apply(lambda x: 1, raw=raw)
expected = er.apply(lambda x: 1, raw=raw)
tm.assert_frame_equal(result, expected)
def test_all2(self):
# more sophisticated comparison of integer vs.
# time-based windowing
df = DataFrame({'B': np.arange(50)},
index=pd.date_range('20130101',
periods=50, freq='H')
)
# in-range data
dft = df.between_time("09:00", "16:00")
r = dft.rolling(window='5H')
for f in ['sum', 'mean', 'count', 'median', 'std',
'var', 'kurt', 'skew', 'min', 'max']:
result = getattr(r, f)()
# we need to roll the days separately
# to compare with a time-based roll
# finally groupby-apply will return a multi-index
# so we need to drop the day
def agg_by_day(x):
x = x.between_time("09:00", "16:00")
return getattr(x.rolling(5, min_periods=1), f)()
expected = df.groupby(df.index.day).apply(
agg_by_day).reset_index(level=0, drop=True)
tm.assert_frame_equal(result, expected)
def test_groupby_monotonic(self):
# GH 15130
# we don't need to validate monotonicity when grouping
data = [
['David', '1/1/2015', 100], ['David', '1/5/2015', 500],
['David', '5/30/2015', 50], ['David', '7/25/2015', 50],
['Ryan', '1/4/2014', 100], ['Ryan', '1/19/2015', 500],
['Ryan', '3/31/2016', 50], ['Joe', '7/1/2015', 100],
['Joe', '9/9/2015', 500], ['Joe', '10/15/2015', 50]]
df = DataFrame(data=data, columns=['name', 'date', 'amount'])
df['date'] = pd.to_datetime(df['date'])
expected = df.set_index('date').groupby('name').apply(
lambda x: x.rolling('180D')['amount'].sum())
result = df.groupby('name').rolling('180D', on='date')['amount'].sum()
tm.assert_series_equal(result, expected)
def test_non_monotonic(self):
# GH 13966 (similar to #15130, closed by #15175)
dates = pd.date_range(start='2016-01-01 09:30:00',
periods=20, freq='s')
df = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8,
'B': np.concatenate((dates, dates)),
'C': np.arange(40)})
result = df.groupby('A').rolling('4s', on='B').C.mean()
expected = df.set_index('B').groupby('A').apply(
lambda x: x.rolling('4s')['C'].mean())
tm.assert_series_equal(result, expected)
df2 = df.sort_values('B')
result = df2.groupby('A').rolling('4s', on='B').C.mean()
tm.assert_series_equal(result, expected)
def test_rolling_cov_offset(self):
# GH16058
idx = pd.date_range('2017-01-01', periods=24, freq='1h')
ss = Series(np.arange(len(idx)), index=idx)
result = ss.rolling('2h').cov()
expected = Series([np.nan] + [0.5] * (len(idx) - 1), index=idx)
tm.assert_series_equal(result, expected)
expected2 = ss.rolling(2, min_periods=1).cov()
tm.assert_series_equal(result, expected2)
result = ss.rolling('3h').cov()
expected = Series([np.nan, 0.5] + [1.0] * (len(idx) - 2), index=idx)
tm.assert_series_equal(result, expected)
expected2 = ss.rolling(3, min_periods=1).cov()
tm.assert_series_equal(result, expected2)