laywerrobot/lib/python3.6/site-packages/pandas/tests/test_resample.py

# pylint: disable=E1101
from warnings import catch_warnings
from datetime import datetime, timedelta
from functools import partial
from textwrap import dedent
from operator import methodcaller
import pytz
import pytest
import dateutil
import numpy as np
import pandas as pd
import pandas.tseries.offsets as offsets
import pandas.util.testing as tm
from pandas import (Series, DataFrame, Panel, Index, isna,
notna, Timestamp)
from pandas.compat import range, lrange, zip, product, OrderedDict
from pandas.errors import UnsupportedFunctionCall
from pandas.core.groupby.groupby import DataError
import pandas.core.common as com
from pandas.tseries.frequencies import to_offset
from pandas.core.indexes.datetimes import date_range
from pandas.tseries.offsets import Minute, BDay
from pandas.core.indexes.period import period_range, PeriodIndex, Period
from pandas.core.resample import DatetimeIndex, TimeGrouper
from pandas.core.indexes.timedeltas import timedelta_range, TimedeltaIndex
from pandas.util.testing import (assert_series_equal, assert_almost_equal,
assert_frame_equal, assert_index_equal)
from pandas._libs.tslibs.period import IncompatibleFrequency
from pandas._libs.tslibs.ccalendar import DAYS, MONTHS
bday = BDay()
# The various methods we support
downsample_methods = ['min', 'max', 'first', 'last', 'sum', 'mean', 'sem',
'median', 'prod', 'var', 'ohlc']
upsample_methods = ['count', 'size']
series_methods = ['nunique']
resample_methods = downsample_methods + upsample_methods + series_methods
def _simple_ts(start, end, freq='D'):
rng = date_range(start, end, freq=freq)
return Series(np.random.randn(len(rng)), index=rng)
def _simple_pts(start, end, freq='D'):
rng = period_range(start, end, freq=freq)
return Series(np.random.randn(len(rng)), index=rng)
class TestResampleAPI(object):
def setup_method(self, method):
dti = DatetimeIndex(start=datetime(2005, 1, 1),
end=datetime(2005, 1, 10), freq='Min')
self.series = Series(np.random.rand(len(dti)), dti)
self.frame = DataFrame(
{'A': self.series, 'B': self.series, 'C': np.arange(len(dti))})
def test_str(self):
r = self.series.resample('H')
assert ('DatetimeIndexResampler [freq=<Hour>, axis=0, closed=left, '
'label=left, convention=start, base=0]' in str(r))
def test_api(self):
r = self.series.resample('H')
result = r.mean()
assert isinstance(result, Series)
assert len(result) == 217
r = self.series.to_frame().resample('H')
result = r.mean()
assert isinstance(result, DataFrame)
assert len(result) == 217
def test_groupby_resample_api(self):
# GH 12448
# .groupby(...).resample(...) hitting warnings
# when appropriate
df = DataFrame({'date': pd.date_range(start='2016-01-01',
periods=4,
freq='W'),
'group': [1, 1, 2, 2],
'val': [5, 6, 7, 8]}).set_index('date')
# replication step
i = pd.date_range('2016-01-03', periods=8).tolist() + \
pd.date_range('2016-01-17', periods=8).tolist()
index = pd.MultiIndex.from_arrays([[1] * 8 + [2] * 8, i],
names=['group', 'date'])
expected = DataFrame({'val': [5] * 7 + [6] + [7] * 7 + [8]},
index=index)
result = df.groupby('group').apply(
lambda x: x.resample('1D').ffill())[['val']]
assert_frame_equal(result, expected)
def test_groupby_resample_on_api(self):
# GH 15021
# .groupby(...).resample(on=...) results in an unexpected
# keyword warning.
df = DataFrame({'key': ['A', 'B'] * 5,
'dates': pd.date_range('2016-01-01', periods=10),
'values': np.random.randn(10)})
expected = df.set_index('dates').groupby('key').resample('D').mean()
result = df.groupby('key').resample('D', on='dates').mean()
assert_frame_equal(result, expected)
def test_pipe(self):
# GH17905
# series
r = self.series.resample('H')
expected = r.max() - r.mean()
result = r.pipe(lambda x: x.max() - x.mean())
tm.assert_series_equal(result, expected)
# dataframe
r = self.frame.resample('H')
expected = r.max() - r.mean()
result = r.pipe(lambda x: x.max() - x.mean())
tm.assert_frame_equal(result, expected)
def test_getitem(self):
r = self.frame.resample('H')
tm.assert_index_equal(r._selected_obj.columns, self.frame.columns)
r = self.frame.resample('H')['B']
assert r._selected_obj.name == self.frame.columns[1]
# technically this is allowed
r = self.frame.resample('H')['A', 'B']
tm.assert_index_equal(r._selected_obj.columns,
self.frame.columns[[0, 1]])
r = self.frame.resample('H')['A', 'B']
tm.assert_index_equal(r._selected_obj.columns,
self.frame.columns[[0, 1]])
def test_select_bad_cols(self):
g = self.frame.resample('H')
pytest.raises(KeyError, g.__getitem__, ['D'])
pytest.raises(KeyError, g.__getitem__, ['A', 'D'])
with tm.assert_raises_regex(KeyError, '^[^A]+$'):
# A should not be referenced as a bad column...
# will have to rethink regex if you change message!
g[['A', 'D']]
def test_attribute_access(self):
r = self.frame.resample('H')
tm.assert_series_equal(r.A.sum(), r['A'].sum())
def test_api_compat_before_use(self):
# make sure that we are setting the binner
# on these attributes
for attr in ['groups', 'ngroups', 'indices']:
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = Series(np.arange(len(rng)), index=rng)
rs = ts.resample('30s')
# before use
getattr(rs, attr)
# after the grouper is initialized, access is ok
rs.mean()
getattr(rs, attr)
def test_skip_nuisance(self):
df = self.frame
df['D'] = 'foo'
r = df.resample('H')
result = r[['A', 'B']].sum()
expected = pd.concat([r.A.sum(), r.B.sum()], axis=1)
assert_frame_equal(result, expected)
expected = r[['A', 'B', 'C']].sum()
result = r.sum()
assert_frame_equal(result, expected)
def test_downsample_but_actually_upsampling(self):
# this is reindex / asfreq
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = Series(np.arange(len(rng), dtype='int64'), index=rng)
result = ts.resample('20s').asfreq()
expected = Series([0, 20, 40, 60, 80],
index=pd.date_range('2012-01-01 00:00:00',
freq='20s',
periods=5))
assert_series_equal(result, expected)
def test_combined_up_downsampling_of_irregular(self):
# since we are really doing an operation like this
# ts2.resample('2s').mean().ffill()
# preserve these semantics
rng = pd.date_range('1/1/2012', periods=100, freq='S')
ts = Series(np.arange(len(rng)), index=rng)
ts2 = ts.iloc[[0, 1, 2, 3, 5, 7, 11, 15, 16, 25, 30]]
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = ts2.resample('2s', how='mean', fill_method='ffill')
expected = ts2.resample('2s').mean().ffill()
assert_series_equal(result, expected)
def test_transform(self):
r = self.series.resample('20min')
expected = self.series.groupby(
pd.Grouper(freq='20min')).transform('mean')
result = r.transform('mean')
assert_series_equal(result, expected)
def test_fillna(self):
# need to upsample here
rng = pd.date_range('1/1/2012', periods=10, freq='2S')
ts = Series(np.arange(len(rng), dtype='int64'), index=rng)
r = ts.resample('s')
expected = r.ffill()
result = r.fillna(method='ffill')
assert_series_equal(result, expected)
expected = r.bfill()
result = r.fillna(method='bfill')
assert_series_equal(result, expected)
with pytest.raises(ValueError):
r.fillna(0)
def test_apply_without_aggregation(self):
# both resample and groupby should work w/o aggregation
r = self.series.resample('20min')
g = self.series.groupby(pd.Grouper(freq='20min'))
for t in [g, r]:
result = t.apply(lambda x: x)
assert_series_equal(result, self.series)
def test_agg_consistency(self):
# make sure that we are consistent across
# similar aggregations with and w/o selection list
df = DataFrame(np.random.randn(1000, 3),
index=pd.date_range('1/1/2012', freq='S', periods=1000),
columns=['A', 'B', 'C'])
r = df.resample('3T')
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
expected = r[['A', 'B', 'C']].agg({'r1': 'mean', 'r2': 'sum'})
result = r.agg({'r1': 'mean', 'r2': 'sum'})
assert_frame_equal(result, expected)
# TODO: once GH 14008 is fixed, move these tests into
# `Base` test class
def test_agg(self):
# test with all three Resampler apis and TimeGrouper
np.random.seed(1234)
index = date_range(datetime(2005, 1, 1),
datetime(2005, 1, 10), freq='D')
index.name = 'date'
df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
df_col = df.reset_index()
df_mult = df_col.copy()
df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
names=['index', 'date'])
r = df.resample('2D')
cases = [
r,
df_col.resample('2D', on='date'),
df_mult.resample('2D', level='date'),
df.groupby(pd.Grouper(freq='2D'))
]
a_mean = r['A'].mean()
a_std = r['A'].std()
a_sum = r['A'].sum()
b_mean = r['B'].mean()
b_std = r['B'].std()
b_sum = r['B'].sum()
expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
expected.columns = pd.MultiIndex.from_product([['A', 'B'],
['mean', 'std']])
for t in cases:
result = t.aggregate([np.mean, np.std])
assert_frame_equal(result, expected)
expected = pd.concat([a_mean, b_std], axis=1)
for t in cases:
result = t.aggregate({'A': np.mean,
'B': np.std})
assert_frame_equal(result, expected, check_like=True)
expected = pd.concat([a_mean, a_std], axis=1)
expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
('A', 'std')])
for t in cases:
result = t.aggregate({'A': ['mean', 'std']})
assert_frame_equal(result, expected)
expected = pd.concat([a_mean, a_sum], axis=1)
expected.columns = ['mean', 'sum']
for t in cases:
result = t['A'].aggregate(['mean', 'sum'])
assert_frame_equal(result, expected)
expected = pd.concat([a_mean, a_sum], axis=1)
expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
('A', 'sum')])
for t in cases:
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'}})
assert_frame_equal(result, expected, check_like=True)
expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
('A', 'sum'),
('B', 'mean2'),
('B', 'sum2')])
for t in cases:
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = t.aggregate({'A': {'mean': 'mean', 'sum': 'sum'},
'B': {'mean2': 'mean', 'sum2': 'sum'}})
assert_frame_equal(result, expected, check_like=True)
expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
expected.columns = pd.MultiIndex.from_tuples([('A', 'mean'),
('A', 'std'),
('B', 'mean'),
('B', 'std')])
for t in cases:
result = t.aggregate({'A': ['mean', 'std'],
'B': ['mean', 'std']})
assert_frame_equal(result, expected, check_like=True)
expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
expected.columns = pd.MultiIndex.from_tuples([('r1', 'A', 'mean'),
('r1', 'A', 'sum'),
('r2', 'B', 'mean'),
('r2', 'B', 'sum')])
def test_agg_misc(self):
# test with all three Resampler apis and TimeGrouper
np.random.seed(1234)
index = date_range(datetime(2005, 1, 1),
datetime(2005, 1, 10), freq='D')
index.name = 'date'
df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
df_col = df.reset_index()
df_mult = df_col.copy()
df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
names=['index', 'date'])
r = df.resample('2D')
cases = [
r,
df_col.resample('2D', on='date'),
df_mult.resample('2D', level='date'),
df.groupby(pd.Grouper(freq='2D'))
]
# passed lambda
for t in cases:
result = t.agg({'A': np.sum,
'B': lambda x: np.std(x, ddof=1)})
rcustom = t['B'].apply(lambda x: np.std(x, ddof=1))
expected = pd.concat([r['A'].sum(), rcustom], axis=1)
assert_frame_equal(result, expected, check_like=True)
# agg with renamers
expected = pd.concat([t['A'].sum(),
t['B'].sum(),
t['A'].mean(),
t['B'].mean()],
axis=1)
expected.columns = pd.MultiIndex.from_tuples([('result1', 'A'),
('result1', 'B'),
('result2', 'A'),
('result2', 'B')])
for t in cases:
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = t[['A', 'B']].agg(OrderedDict([('result1', np.sum),
('result2', np.mean)]))
assert_frame_equal(result, expected, check_like=True)
# agg with different hows
expected = pd.concat([t['A'].sum(),
t['A'].std(),
t['B'].mean(),
t['B'].std()],
axis=1)
expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
('A', 'std'),
('B', 'mean'),
('B', 'std')])
for t in cases:
result = t.agg(OrderedDict([('A', ['sum', 'std']),
('B', ['mean', 'std'])]))
assert_frame_equal(result, expected, check_like=True)
# equivalent of using a selection list / or not
for t in cases:
result = t[['A', 'B']].agg({'A': ['sum', 'std'],
'B': ['mean', 'std']})
assert_frame_equal(result, expected, check_like=True)
# series like aggs
for t in cases:
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = t['A'].agg({'A': ['sum', 'std']})
expected = pd.concat([t['A'].sum(),
t['A'].std()],
axis=1)
expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
('A', 'std')])
assert_frame_equal(result, expected, check_like=True)
expected = pd.concat([t['A'].agg(['sum', 'std']),
t['A'].agg(['mean', 'std'])],
axis=1)
expected.columns = pd.MultiIndex.from_tuples([('A', 'sum'),
('A', 'std'),
('B', 'mean'),
('B', 'std')])
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = t['A'].agg({'A': ['sum', 'std'],
'B': ['mean', 'std']})
assert_frame_equal(result, expected, check_like=True)
# errors
# invalid names in the agg specification
for t in cases:
def f():
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
t[['A']].agg({'A': ['sum', 'std'],
'B': ['mean', 'std']})
pytest.raises(KeyError, f)
def test_agg_nested_dicts(self):
np.random.seed(1234)
index = date_range(datetime(2005, 1, 1),
datetime(2005, 1, 10), freq='D')
index.name = 'date'
df = DataFrame(np.random.rand(10, 2), columns=list('AB'), index=index)
df_col = df.reset_index()
df_mult = df_col.copy()
df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
names=['index', 'date'])
r = df.resample('2D')
cases = [
r,
df_col.resample('2D', on='date'),
df_mult.resample('2D', level='date'),
df.groupby(pd.Grouper(freq='2D'))
]
for t in cases:
def f():
t.aggregate({'r1': {'A': ['mean', 'sum']},
'r2': {'B': ['mean', 'sum']}})
pytest.raises(ValueError, f)
for t in cases:
expected = pd.concat([t['A'].mean(), t['A'].std(), t['B'].mean(),
t['B'].std()], axis=1)
expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), (
'ra', 'std'), ('rb', 'mean'), ('rb', 'std')])
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = t[['A', 'B']].agg({'A': {'ra': ['mean', 'std']},
'B': {'rb': ['mean', 'std']}})
assert_frame_equal(result, expected, check_like=True)
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = t.agg({'A': {'ra': ['mean', 'std']},
'B': {'rb': ['mean', 'std']}})
assert_frame_equal(result, expected, check_like=True)
def test_try_aggregate_non_existing_column(self):
# GH 16766
data = [
{'dt': datetime(2017, 6, 1, 0), 'x': 1.0, 'y': 2.0},
{'dt': datetime(2017, 6, 1, 1), 'x': 2.0, 'y': 2.0},
{'dt': datetime(2017, 6, 1, 2), 'x': 3.0, 'y': 1.5}
]
df = DataFrame(data).set_index('dt')
# Error as we don't have a 'z' column
with pytest.raises(KeyError):
df.resample('30T').agg({'x': ['mean'],
'y': ['median'],
'z': ['sum']})
def test_selection_api_validation(self):
# GH 13500
index = date_range(datetime(2005, 1, 1),
datetime(2005, 1, 10), freq='D')
rng = np.arange(len(index), dtype=np.int64)
df = DataFrame({'date': index, 'a': rng},
index=pd.MultiIndex.from_arrays([rng, index],
names=['v', 'd']))
df_exp = DataFrame({'a': rng}, index=index)
# non DatetimeIndex
with pytest.raises(TypeError):
df.resample('2D', level='v')
with pytest.raises(ValueError):
df.resample('2D', on='date', level='d')
with pytest.raises(TypeError):
df.resample('2D', on=['a', 'date'])
with pytest.raises(KeyError):
df.resample('2D', level=['a', 'date'])
# upsampling not allowed
with pytest.raises(ValueError):
df.resample('2D', level='d').asfreq()
with pytest.raises(ValueError):
df.resample('2D', on='date').asfreq()
exp = df_exp.resample('2D').sum()
exp.index.name = 'date'
assert_frame_equal(exp, df.resample('2D', on='date').sum())
exp.index.name = 'd'
assert_frame_equal(exp, df.resample('2D', level='d').sum())
class Base(object):
"""
Base class for resampling tests; calling
.create_series() generates a series of each index type.
"""
def create_index(self, *args, **kwargs):
""" return the _index_factory created using the args, kwargs """
factory = self._index_factory()
return factory(*args, **kwargs)
@pytest.fixture
def _index_start(self):
return datetime(2005, 1, 1)
@pytest.fixture
def _index_end(self):
return datetime(2005, 1, 10)
@pytest.fixture
def _index_freq(self):
return 'D'
@pytest.fixture
def index(self, _index_start, _index_end, _index_freq):
return self.create_index(_index_start, _index_end, freq=_index_freq)
@pytest.fixture
def _series_name(self):
raise com.AbstractMethodError(self)
@pytest.fixture
def _static_values(self, index):
return np.arange(len(index))
@pytest.fixture
def series(self, index, _series_name, _static_values):
return Series(_static_values, index=index, name=_series_name)
@pytest.fixture
def frame(self, index, _static_values):
return DataFrame({'value': _static_values}, index=index)
@pytest.fixture(params=[Series, DataFrame])
def series_and_frame(self, request, index, _series_name, _static_values):
if request.param == Series:
return Series(_static_values, index=index, name=_series_name)
if request.param == DataFrame:
return DataFrame({'value': _static_values}, index=index)
@pytest.mark.parametrize('freq', ['2D', '1H'])
def test_asfreq(self, series_and_frame, freq):
obj = series_and_frame
result = obj.resample(freq).asfreq()
if freq == '2D':
new_index = obj.index.take(np.arange(0, len(obj.index), 2))
new_index.freq = to_offset('2D')
else:
new_index = self.create_index(obj.index[0], obj.index[-1],
freq=freq)
expected = obj.reindex(new_index)
assert_almost_equal(result, expected)
def test_asfreq_fill_value(self):
# test for fill value during resampling, issue 3715
s = self.create_series()
result = s.resample('1H').asfreq()
new_index = self.create_index(s.index[0], s.index[-1], freq='1H')
expected = s.reindex(new_index)
assert_series_equal(result, expected)
frame = s.to_frame('value')
frame.iloc[1] = None
result = frame.resample('1H').asfreq(fill_value=4.0)
new_index = self.create_index(frame.index[0],
frame.index[-1], freq='1H')
expected = frame.reindex(new_index, fill_value=4.0)
assert_frame_equal(result, expected)
def test_resample_interpolate(self):
# GH 12925
df = self.create_series().to_frame('value')
assert_frame_equal(
df.resample('1T').asfreq().interpolate(),
df.resample('1T').interpolate())
def test_raises_on_non_datetimelike_index(self):
# this is a non datetimelike index
xp = DataFrame()
pytest.raises(TypeError, lambda: xp.resample('A').mean())
def test_resample_empty_series(self):
# GH12771 & GH12868
s = self.create_series()[:0]
for freq in ['M', 'D', 'H']:
# need to test for ohlc from GH13083
methods = [method for method in resample_methods
if method != 'ohlc']
for method in methods:
result = getattr(s.resample(freq), method)()
expected = s.copy()
expected.index = s.index._shallow_copy(freq=freq)
assert_index_equal(result.index, expected.index)
assert result.index.freq == expected.index.freq
assert_series_equal(result, expected, check_dtype=False)
def test_resample_empty_dataframe(self):
# GH13212
index = self.create_series().index[:0]
f = DataFrame(index=index)
for freq in ['M', 'D', 'H']:
# count retains dimensions too
methods = downsample_methods + upsample_methods
for method in methods:
result = getattr(f.resample(freq), method)()
if method != 'size':
expected = f.copy()
else:
# GH14962
expected = Series([])
expected.index = f.index._shallow_copy(freq=freq)
assert_index_equal(result.index, expected.index)
assert result.index.freq == expected.index.freq
assert_almost_equal(result, expected, check_dtype=False)
# test size for GH13212 (currently stays as df)
@pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0))
@pytest.mark.parametrize(
"dtype",
[np.float, np.int, np.object, 'datetime64[ns]'])
def test_resample_empty_dtypes(self, index, dtype):
# Empty series were sometimes causing a segfault (for the functions
# with Cython bounds-checking disabled) or an IndexError. We just run
# them to ensure they no longer do. (GH #10228)
for how in downsample_methods + upsample_methods:
empty_series = Series([], index, dtype)
try:
getattr(empty_series.resample('d'), how)()
except DataError:
# Ignore these since some combinations are invalid
# (ex: doing mean with dtype of np.object)
pass
def test_resample_loffset_arg_type(self):
# GH 13218, 15002
df = self.create_series().to_frame('value')
expected_means = [df.values[i:i + 2].mean()
for i in range(0, len(df.values), 2)]
expected_index = self.create_index(df.index[0],
periods=len(df.index) / 2,
freq='2D')
# loffset coerces PeriodIndex to DatetimeIndex
if isinstance(expected_index, PeriodIndex):
expected_index = expected_index.to_timestamp()
expected_index += timedelta(hours=2)
expected = DataFrame({'value': expected_means}, index=expected_index)
for arg in ['mean', {'value': 'mean'}, ['mean']]:
result_agg = df.resample('2D', loffset='2H').agg(arg)
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result_how = df.resample('2D', how=arg, loffset='2H')
if isinstance(arg, list):
expected.columns = pd.MultiIndex.from_tuples([('value',
'mean')])
# GH 13022, 7687 - TODO: fix resample w/ TimedeltaIndex
if isinstance(expected.index, TimedeltaIndex):
with pytest.raises(AssertionError):
assert_frame_equal(result_agg, expected)
assert_frame_equal(result_how, expected)
else:
assert_frame_equal(result_agg, expected)
assert_frame_equal(result_how, expected)
def test_apply_to_empty_series(self):
# GH 14313
series = self.create_series()[:0]
for freq in ['M', 'D', 'H']:
result = series.resample(freq).apply(lambda x: 1)
expected = series.resample(freq).apply(np.sum)
assert_series_equal(result, expected, check_dtype=False)
class TestDatetimeIndex(Base):
_index_factory = lambda x: date_range
@pytest.fixture
def _series_name(self):
return 'dti'
def setup_method(self, method):
dti = DatetimeIndex(start=datetime(2005, 1, 1),
end=datetime(2005, 1, 10), freq='Min')
self.series = Series(np.random.rand(len(dti)), dti)
def create_series(self):
i = date_range(datetime(2005, 1, 1),
datetime(2005, 1, 10), freq='D')
return Series(np.arange(len(i)), index=i, name='dti')
def test_custom_grouper(self):
dti = DatetimeIndex(freq='Min', start=datetime(2005, 1, 1),
end=datetime(2005, 1, 10))
s = Series(np.array([1] * len(dti)), index=dti, dtype='int64')
b = TimeGrouper(Minute(5))
g = s.groupby(b)
# check all cython functions work
funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
for f in funcs:
g._cython_agg_general(f)
b = TimeGrouper(Minute(5), closed='right', label='right')
g = s.groupby(b)
# check all cython functions work
funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var']
for f in funcs:
g._cython_agg_general(f)
assert g.ngroups == 2593
assert notna(g.mean()).all()
# construct expected val
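# the 12961 minutely points fall into 2593 right-closed 5-minute bins:
# the first bin holds only the first timestamp, the remaining 2592 hold
# five points each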
arr = [1] + [5] * 2592
idx = dti[0:-1:5]
idx = idx.append(dti[-1:])
expect = Series(arr, index=idx)
# GH 2763 - return input dtype if we can
result = g.agg(np.sum)
assert_series_equal(result, expect)
df = DataFrame(np.random.rand(len(dti), 10),
index=dti, dtype='float64')
r = df.groupby(b).agg(np.sum)
assert len(r.columns) == 10
assert len(r.index) == 2593
def test_resample_basic(self):
rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min',
name='index')
s = Series(np.random.randn(14), index=rng)
result = s.resample('5min', closed='right', label='right').mean()
exp_idx = date_range('1/1/2000', periods=4, freq='5min', name='index')
expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()],
index=exp_idx)
assert_series_equal(result, expected)
assert result.index.name == 'index'
result = s.resample('5min', closed='left', label='right').mean()
exp_idx = date_range('1/1/2000 00:05', periods=3, freq='5min',
name='index')
expected = Series([s[:5].mean(), s[5:10].mean(),
s[10:].mean()], index=exp_idx)
assert_series_equal(result, expected)
s = self.series
result = s.resample('5Min').last()
grouper = TimeGrouper(Minute(5), closed='left', label='left')
expect = s.groupby(grouper).agg(lambda x: x[-1])
assert_series_equal(result, expect)
def test_resample_string_kwargs(self):
# Test for issue #19303
rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min',
name='index')
s = Series(np.random.randn(14), index=rng)
# Check that wrong keyword argument strings raise an error
with pytest.raises(ValueError):
s.resample('5min', label='righttt').mean()
with pytest.raises(ValueError):
s.resample('5min', closed='righttt').mean()
with pytest.raises(ValueError):
s.resample('5min', convention='starttt').mean()
def test_resample_how(self):
rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min',
name='index')
s = Series(np.random.randn(14), index=rng)
grouplist = np.ones_like(s)
grouplist[0] = 0
grouplist[1:6] = 1
grouplist[6:11] = 2
grouplist[11:] = 3
args = downsample_methods
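# reference OHLC: open, high, low, close per group (four NaNs for an
# all-NaN group)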
def _ohlc(group):
if isna(group).all():
return np.repeat(np.nan, 4)
return [group[0], group.max(), group.min(), group[-1]]
inds = date_range('1/1/2000', periods=4, freq='5min', name='index')
for arg in args:
if arg == 'ohlc':
func = _ohlc
else:
func = arg
try:
result = getattr(s.resample(
'5min', closed='right', label='right'), arg)()
expected = s.groupby(grouplist).agg(func)
assert result.index.name == 'index'
if arg == 'ohlc':
expected = DataFrame(expected.values.tolist())
expected.columns = ['open', 'high', 'low', 'close']
expected.index = Index(inds, name='index')
assert_frame_equal(result, expected)
else:
expected.index = inds
assert_series_equal(result, expected)
except BaseException as exc:
exc.args += ('how=%s' % arg,)
raise
def test_numpy_compat(self):
# see gh-12811
s = Series([1, 2, 3, 4, 5], index=date_range(
'20130101', periods=5, freq='s'))
r = s.resample('2s')
msg = "numpy operations are not valid with resample"
for func in ('min', 'max', 'sum', 'prod',
'mean', 'var', 'std'):
tm.assert_raises_regex(UnsupportedFunctionCall, msg,
getattr(r, func),
func, 1, 2, 3)
tm.assert_raises_regex(UnsupportedFunctionCall, msg,
getattr(r, func), axis=1)
def test_resample_how_callables(self):
# GH 7929
data = np.arange(5, dtype=np.int64)
ind = pd.DatetimeIndex(start='2014-01-01', periods=len(data), freq='d')
df = DataFrame({"A": data, "B": data}, index=ind)
def fn(x, a=1):
return str(type(x))
class FnClass(object):
def __call__(self, x):
return str(type(x))
df_standard = df.resample("M").apply(fn)
df_lambda = df.resample("M").apply(lambda x: str(type(x)))
df_partial = df.resample("M").apply(partial(fn))
df_partial2 = df.resample("M").apply(partial(fn, a=2))
df_class = df.resample("M").apply(FnClass())
assert_frame_equal(df_standard, df_lambda)
assert_frame_equal(df_standard, df_partial)
assert_frame_equal(df_standard, df_partial2)
assert_frame_equal(df_standard, df_class)
def test_resample_with_timedeltas(self):
expected = DataFrame({'A': np.arange(1480)})
expected = expected.groupby(expected.index // 30).sum()
expected.index = pd.timedelta_range('0 days', freq='30T', periods=50)
df = DataFrame({'A': np.arange(1480)}, index=pd.to_timedelta(
np.arange(1480), unit='T'))
result = df.resample('30T').sum()
assert_frame_equal(result, expected)
s = df['A']
result = s.resample('30T').sum()
assert_series_equal(result, expected['A'])
def test_resample_single_period_timedelta(self):
s = Series(list(range(5)), index=pd.timedelta_range(
'1 day', freq='s', periods=5))
result = s.resample('2s').sum()
expected = Series([1, 5, 4], index=pd.timedelta_range(
'1 day', freq='2s', periods=3))
assert_series_equal(result, expected)
def test_resample_timedelta_idempotency(self):
# GH 12072
index = pd.timedelta_range('0', periods=9, freq='10L')
series = Series(range(9), index=index)
result = series.resample('10L').mean()
expected = series
assert_series_equal(result, expected)
def test_resample_rounding(self):
# GH 8371
# odd results when rounding is needed
data = """date,time,value
11-08-2014,00:00:01.093,1
11-08-2014,00:00:02.159,1
11-08-2014,00:00:02.667,1
11-08-2014,00:00:03.175,1
11-08-2014,00:00:07.058,1
11-08-2014,00:00:07.362,1
11-08-2014,00:00:08.324,1
11-08-2014,00:00:08.830,1
11-08-2014,00:00:08.982,1
11-08-2014,00:00:09.815,1
11-08-2014,00:00:10.540,1
11-08-2014,00:00:11.061,1
11-08-2014,00:00:11.617,1
11-08-2014,00:00:13.607,1
11-08-2014,00:00:14.535,1
11-08-2014,00:00:15.525,1
11-08-2014,00:00:17.960,1
11-08-2014,00:00:20.674,1
11-08-2014,00:00:21.191,1"""
from pandas.compat import StringIO
df = pd.read_csv(StringIO(data), parse_dates={'timestamp': [
'date', 'time']}, index_col='timestamp')
df.index.name = None
result = df.resample('6s').sum()
expected = DataFrame({'value': [
4, 9, 4, 2
]}, index=date_range('2014-11-08', freq='6s', periods=4))
assert_frame_equal(result, expected)
result = df.resample('7s').sum()
expected = DataFrame({'value': [
4, 10, 4, 1
]}, index=date_range('2014-11-08', freq='7s', periods=4))
assert_frame_equal(result, expected)
result = df.resample('11s').sum()
expected = DataFrame({'value': [
11, 8
]}, index=date_range('2014-11-08', freq='11s', periods=2))
assert_frame_equal(result, expected)
result = df.resample('13s').sum()
expected = DataFrame({'value': [
13, 6
]}, index=date_range('2014-11-08', freq='13s', periods=2))
assert_frame_equal(result, expected)
result = df.resample('17s').sum()
expected = DataFrame({'value': [
16, 3
]}, index=date_range('2014-11-08', freq='17s', periods=2))
assert_frame_equal(result, expected)
def test_resample_basic_from_daily(self):
# from daily
dti = DatetimeIndex(start=datetime(2005, 1, 1),
end=datetime(2005, 1, 10), freq='D', name='index')
s = Series(np.random.rand(len(dti)), dti)
# to weekly
result = s.resample('w-sun').last()
assert len(result) == 3
assert (result.index.dayofweek == [6, 6, 6]).all()
assert result.iloc[0] == s['1/2/2005']
assert result.iloc[1] == s['1/9/2005']
assert result.iloc[2] == s.iloc[-1]
result = s.resample('W-MON').last()
assert len(result) == 2
assert (result.index.dayofweek == [0, 0]).all()
assert result.iloc[0] == s['1/3/2005']
assert result.iloc[1] == s['1/10/2005']
result = s.resample('W-TUE').last()
assert len(result) == 2
assert (result.index.dayofweek == [1, 1]).all()
assert result.iloc[0] == s['1/4/2005']
assert result.iloc[1] == s['1/10/2005']
result = s.resample('W-WED').last()
assert len(result) == 2
assert (result.index.dayofweek == [2, 2]).all()
assert result.iloc[0] == s['1/5/2005']
assert result.iloc[1] == s['1/10/2005']
result = s.resample('W-THU').last()
assert len(result) == 2
assert (result.index.dayofweek == [3, 3]).all()
assert result.iloc[0] == s['1/6/2005']
assert result.iloc[1] == s['1/10/2005']
result = s.resample('W-FRI').last()
assert len(result) == 2
assert (result.index.dayofweek == [4, 4]).all()
assert result.iloc[0] == s['1/7/2005']
assert result.iloc[1] == s['1/10/2005']
# to biz day
result = s.resample('B').last()
assert len(result) == 7
assert (result.index.dayofweek == [4, 0, 1, 2, 3, 4, 0]).all()
assert result.iloc[0] == s['1/2/2005']
assert result.iloc[1] == s['1/3/2005']
assert result.iloc[5] == s['1/9/2005']
assert result.index.name == 'index'
def test_resample_upsampling_picked_but_not_correct(self):
# Test for issue #3020
dates = date_range('01-Jan-2014', '05-Jan-2014', freq='D')
series = Series(1, index=dates)
result = series.resample('D').mean()
assert result.index[0] == dates[0]
# GH 5955
# incorrectly deciding to upsample when the axis frequency matches the
# resample frequency
import datetime
s = Series(np.arange(1., 6), index=[datetime.datetime(
1975, 1, i, 12, 0) for i in range(1, 6)])
expected = Series(np.arange(1., 6), index=date_range(
'19750101', periods=5, freq='D'))
result = s.resample('D').count()
assert_series_equal(result, Series(1, index=expected.index))
result1 = s.resample('D').sum()
result2 = s.resample('D').mean()
assert_series_equal(result1, expected)
assert_series_equal(result2, expected)
def test_resample_frame_basic(self):
df = tm.makeTimeDataFrame()
b = TimeGrouper('M')
g = df.groupby(b)
# check all cython functions work
funcs = ['add', 'mean', 'prod', 'min', 'max', 'var']
for f in funcs:
g._cython_agg_general(f)
result = df.resample('A').mean()
assert_series_equal(result['A'], df['A'].resample('A').mean())
result = df.resample('M').mean()
assert_series_equal(result['A'], df['A'].resample('M').mean())
df.resample('M', kind='period').mean()
df.resample('W-WED', kind='period').mean()
def test_resample_loffset(self):
rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min')
s = Series(np.random.randn(14), index=rng)
result = s.resample('5min', closed='right', label='right',
loffset=timedelta(minutes=1)).mean()
idx = date_range('1/1/2000', periods=4, freq='5min')
expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()],
index=idx + timedelta(minutes=1))
assert_series_equal(result, expected)
expected = s.resample(
'5min', closed='right', label='right',
loffset='1min').mean()
assert_series_equal(result, expected)
expected = s.resample(
'5min', closed='right', label='right',
loffset=Minute(1)).mean()
assert_series_equal(result, expected)
assert result.index.freq == Minute(5)
# from daily
dti = DatetimeIndex(start=datetime(2005, 1, 1),
end=datetime(2005, 1, 10), freq='D')
ser = Series(np.random.rand(len(dti)), dti)
# to weekly
result = ser.resample('w-sun').last()
expected = ser.resample('w-sun', loffset=-bday).last()
assert result.index[0] - bday == expected.index[0]
def test_resample_loffset_upsample(self):
# GH 20744
rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min')
s = Series(np.random.randn(14), index=rng)
result = s.resample('5min', closed='right', label='right',
loffset=timedelta(minutes=1)).ffill()
idx = date_range('1/1/2000', periods=4, freq='5min')
expected = Series([s[0], s[5], s[10], s[-1]],
index=idx + timedelta(minutes=1))
assert_series_equal(result, expected)
def test_resample_loffset_count(self):
# GH 12725
start_time = '1/1/2000 00:00:00'
rng = date_range(start_time, periods=100, freq='S')
ts = Series(np.random.randn(len(rng)), index=rng)
result = ts.resample('10S', loffset='1s').count()
expected_index = (
date_range(start_time, periods=10, freq='10S') +
timedelta(seconds=1)
)
expected = Series(10, index=expected_index)
assert_series_equal(result, expected)
# Same issue should apply to .size() since it goes through
# the same code path
result = ts.resample('10S', loffset='1s').size()
assert_series_equal(result, expected)
def test_resample_upsample(self):
# from daily
dti = DatetimeIndex(start=datetime(2005, 1, 1),
end=datetime(2005, 1, 10), freq='D', name='index')
s = Series(np.random.rand(len(dti)), dti)
# to minutely, by padding
result = s.resample('Min').pad()
assert len(result) == 12961
assert result[0] == s[0]
assert result[-1] == s[-1]
assert result.index.name == 'index'
def test_resample_how_method(self):
# GH9915
s = Series([11, 22],
index=[Timestamp('2015-03-31 21:48:52.672000'),
Timestamp('2015-03-31 21:49:52.739000')])
expected = Series([11, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, 22],
index=[Timestamp('2015-03-31 21:48:50'),
Timestamp('2015-03-31 21:49:00'),
Timestamp('2015-03-31 21:49:10'),
Timestamp('2015-03-31 21:49:20'),
Timestamp('2015-03-31 21:49:30'),
Timestamp('2015-03-31 21:49:40'),
Timestamp('2015-03-31 21:49:50')])
assert_series_equal(s.resample("10S").mean(), expected)
def test_resample_extra_index_point(self):
# GH 9756
index = DatetimeIndex(start='20150101', end='20150331', freq='BM')
expected = DataFrame({'A': Series([21, 41, 63], index=index)})
index = DatetimeIndex(start='20150101', end='20150331', freq='B')
df = DataFrame(
{'A': Series(range(len(index)), index=index)}, dtype='int64')
result = df.resample('BM').last()
assert_frame_equal(result, expected)
def test_upsample_with_limit(self):
rng = date_range('1/1/2000', periods=3, freq='5t')
ts = Series(np.random.randn(len(rng)), rng)
result = ts.resample('t').ffill(limit=2)
expected = ts.reindex(result.index, method='ffill', limit=2)
assert_series_equal(result, expected)
def test_nearest_upsample_with_limit(self):
rng = date_range('1/1/2000', periods=3, freq='5t')
ts = Series(np.random.randn(len(rng)), rng)
result = ts.resample('t').nearest(limit=2)
expected = ts.reindex(result.index, method='nearest', limit=2)
assert_series_equal(result, expected)
def test_resample_ohlc(self):
s = self.series
grouper = TimeGrouper(Minute(5))
expect = s.groupby(grouper).agg(lambda x: x[-1])
result = s.resample('5Min').ohlc()
assert len(result) == len(expect)
assert len(result.columns) == 4
xs = result.iloc[-2]
assert xs['open'] == s[-6]
assert xs['high'] == s[-6:-1].max()
assert xs['low'] == s[-6:-1].min()
assert xs['close'] == s[-2]
xs = result.iloc[0]
assert xs['open'] == s[0]
assert xs['high'] == s[:5].max()
assert xs['low'] == s[:5].min()
assert xs['close'] == s[4]
def test_resample_ohlc_result(self):
# GH 12332
index = pd.date_range('1-1-2000', '2-15-2000', freq='h')
index = index.union(pd.date_range('4-15-2000', '5-15-2000', freq='h'))
s = Series(range(len(index)), index=index)
a = s.loc[:'4-15-2000'].resample('30T').ohlc()
assert isinstance(a, DataFrame)
b = s.loc[:'4-14-2000'].resample('30T').ohlc()
assert isinstance(b, DataFrame)
# GH12348
# raising on odd period
rng = date_range('2013-12-30', '2014-01-07')
index = rng.drop([Timestamp('2014-01-01'),
Timestamp('2013-12-31'),
Timestamp('2014-01-04'),
Timestamp('2014-01-05')])
df = DataFrame(data=np.arange(len(index)), index=index)
result = df.resample('B').mean()
expected = df.reindex(index=date_range(rng[0], rng[-1], freq='B'))
assert_frame_equal(result, expected)
def test_resample_ohlc_dataframe(self):
df = (
DataFrame({
'PRICE': {
Timestamp('2011-01-06 10:59:05', tz=None): 24990,
Timestamp('2011-01-06 12:43:33', tz=None): 25499,
Timestamp('2011-01-06 12:54:09', tz=None): 25499},
'VOLUME': {
Timestamp('2011-01-06 10:59:05', tz=None): 1500000000,
Timestamp('2011-01-06 12:43:33', tz=None): 5000000000,
Timestamp('2011-01-06 12:54:09', tz=None): 100000000}})
).reindex(['VOLUME', 'PRICE'], axis=1)
res = df.resample('H').ohlc()
exp = pd.concat([df['VOLUME'].resample('H').ohlc(),
df['PRICE'].resample('H').ohlc()],
axis=1,
keys=['VOLUME', 'PRICE'])
assert_frame_equal(exp, res)
df.columns = [['a', 'b'], ['c', 'd']]
res = df.resample('H').ohlc()
exp.columns = pd.MultiIndex.from_tuples([
('a', 'c', 'open'), ('a', 'c', 'high'), ('a', 'c', 'low'),
('a', 'c', 'close'), ('b', 'd', 'open'), ('b', 'd', 'high'),
('b', 'd', 'low'), ('b', 'd', 'close')])
assert_frame_equal(exp, res)
# dupe columns fail atm
# df.columns = ['PRICE', 'PRICE']
def test_resample_dup_index(self):
# GH 4812
# dup columns with resample raising
df = DataFrame(np.random.randn(4, 12), index=[2000, 2000, 2000, 2000],
columns=[Period(year=2000, month=i + 1, freq='M')
for i in range(12)])
df.iloc[3, :] = np.nan
result = df.resample('Q', axis=1).mean()
expected = df.groupby(lambda x: int((x.month - 1) / 3), axis=1).mean()
expected.columns = [
Period(year=2000, quarter=i + 1, freq='Q') for i in range(4)]
assert_frame_equal(result, expected)
def test_resample_reresample(self):
dti = DatetimeIndex(start=datetime(2005, 1, 1),
end=datetime(2005, 1, 10), freq='D')
s = Series(np.random.rand(len(dti)), dti)
bs = s.resample('B', closed='right', label='right').mean()
result = bs.resample('8H').mean()
assert len(result) == 22
assert isinstance(result.index.freq, offsets.DateOffset)
assert result.index.freq == offsets.Hour(8)
def test_resample_timestamp_to_period(self):
ts = _simple_ts('1/1/1990', '1/1/2000')
result = ts.resample('A-DEC', kind='period').mean()
expected = ts.resample('A-DEC').mean()
expected.index = period_range('1990', '2000', freq='a-dec')
assert_series_equal(result, expected)
result = ts.resample('A-JUN', kind='period').mean()
expected = ts.resample('A-JUN').mean()
expected.index = period_range('1990', '2000', freq='a-jun')
assert_series_equal(result, expected)
result = ts.resample('M', kind='period').mean()
expected = ts.resample('M').mean()
expected.index = period_range('1990-01', '2000-01', freq='M')
assert_series_equal(result, expected)
result = ts.resample('M', kind='period').mean()
expected = ts.resample('M').mean()
expected.index = period_range('1990-01', '2000-01', freq='M')
assert_series_equal(result, expected)
def test_ohlc_5min(self):
def _ohlc(group):
if isna(group).all():
return np.repeat(np.nan, 4)
return [group[0], group.max(), group.min(), group[-1]]
rng = date_range('1/1/2000 00:00:00', '1/1/2000 5:59:50', freq='10s')
ts = Series(np.random.randn(len(rng)), index=rng)
resampled = ts.resample('5min', closed='right',
label='right').ohlc()
assert (resampled.loc['1/1/2000 00:00'] == ts[0]).all()
exp = _ohlc(ts[1:31])
assert (resampled.loc['1/1/2000 00:05'] == exp).all()
exp = _ohlc(ts['1/1/2000 5:55:01':])
assert (resampled.loc['1/1/2000 6:00:00'] == exp).all()
def test_downsample_non_unique(self):
rng = date_range('1/1/2000', '2/29/2000')
rng2 = rng.repeat(5).values
ts = Series(np.random.randn(len(rng2)), index=rng2)
result = ts.resample('M').mean()
expected = ts.groupby(lambda x: x.month).mean()
assert len(result) == 2
assert_almost_equal(result[0], expected[1])
assert_almost_equal(result[1], expected[2])
def test_asfreq_non_unique(self):
# GH #1077
rng = date_range('1/1/2000', '2/29/2000')
rng2 = rng.repeat(2).values
ts = Series(np.random.randn(len(rng2)), index=rng2)
pytest.raises(Exception, ts.asfreq, 'B')
def test_resample_axis1(self):
rng = date_range('1/1/2000', '2/29/2000')
df = DataFrame(np.random.randn(3, len(rng)), columns=rng,
index=['a', 'b', 'c'])
result = df.resample('M', axis=1).mean()
expected = df.T.resample('M').mean().T
tm.assert_frame_equal(result, expected)
def test_resample_panel(self):
rng = date_range('1/1/2000', '6/30/2000')
n = len(rng)
with catch_warnings(record=True):
panel = Panel(np.random.randn(3, n, 5),
items=['one', 'two', 'three'],
major_axis=rng,
minor_axis=['a', 'b', 'c', 'd', 'e'])
result = panel.resample('M', axis=1).mean()
def p_apply(panel, f):
result = {}
for item in panel.items:
result[item] = f(panel[item])
return Panel(result, items=panel.items)
expected = p_apply(panel, lambda x: x.resample('M').mean())
tm.assert_panel_equal(result, expected)
panel2 = panel.swapaxes(1, 2)
result = panel2.resample('M', axis=2).mean()
expected = p_apply(panel2,
lambda x: x.resample('M', axis=1).mean())
tm.assert_panel_equal(result, expected)
def test_resample_panel_numpy(self):
rng = date_range('1/1/2000', '6/30/2000')
n = len(rng)
with catch_warnings(record=True):
panel = Panel(np.random.randn(3, n, 5),
items=['one', 'two', 'three'],
major_axis=rng,
minor_axis=['a', 'b', 'c', 'd', 'e'])
result = panel.resample('M', axis=1).apply(lambda x: x.mean(1))
expected = panel.resample('M', axis=1).mean()
tm.assert_panel_equal(result, expected)
panel = panel.swapaxes(1, 2)
result = panel.resample('M', axis=2).apply(lambda x: x.mean(2))
expected = panel.resample('M', axis=2).mean()
tm.assert_panel_equal(result, expected)
def test_resample_anchored_ticks(self):
# If a fixed delta (5 minute, 4 hour) evenly divides a day, we should
# "anchor" the origin at midnight so we get regular intervals rather
# than starting from the first timestamp which might start in the
# middle of a desired interval
rng = date_range('1/1/2000 04:00:00', periods=86400, freq='s')
ts = Series(np.random.randn(len(rng)), index=rng)
ts[:2] = np.nan # so results are the same
freqs = ['t', '5t', '15t', '30t', '4h', '12h']
for freq in freqs:
result = ts[2:].resample(freq, closed='left', label='left').mean()
expected = ts.resample(freq, closed='left', label='left').mean()
assert_series_equal(result, expected)
def test_resample_single_group(self):
mysum = lambda x: x.sum()
rng = date_range('2000-1-1', '2000-2-10', freq='D')
ts = Series(np.random.randn(len(rng)), index=rng)
assert_series_equal(ts.resample('M').sum(),
ts.resample('M').apply(mysum))
rng = date_range('2000-1-1', '2000-1-10', freq='D')
ts = Series(np.random.randn(len(rng)), index=rng)
assert_series_equal(ts.resample('M').sum(),
ts.resample('M').apply(mysum))
# GH 3849
s = Series([30.1, 31.6], index=[Timestamp('20070915 15:30:00'),
Timestamp('20070915 15:40:00')])
expected = Series([0.75], index=[Timestamp('20070915')])
result = s.resample('D').apply(lambda x: np.std(x))
assert_series_equal(result, expected)
def test_resample_base(self):
rng = date_range('1/1/2000 00:00:00', '1/1/2000 02:00', freq='s')
ts = Series(np.random.randn(len(rng)), index=rng)
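# base=2 shifts the 5-minute bin edges by two minutes, so the first bin
# label falls at 1999-12-31 23:57:00 rather than at 2000-01-01 00:00:00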
resampled = ts.resample('5min', base=2).mean()
exp_rng = date_range('12/31/1999 23:57:00', '1/1/2000 01:57',
freq='5min')
tm.assert_index_equal(resampled.index, exp_rng)
def test_resample_base_with_timedeltaindex(self):
# GH 10530
rng = timedelta_range(start='0s', periods=25, freq='s')
ts = Series(np.random.randn(len(rng)), index=rng)
with_base = ts.resample('2s', base=5).mean()
without_base = ts.resample('2s').mean()
exp_without_base = timedelta_range(start='0s', end='25s', freq='2s')
exp_with_base = timedelta_range(start='5s', end='29s', freq='2s')
tm.assert_index_equal(without_base.index, exp_without_base)
tm.assert_index_equal(with_base.index, exp_with_base)
def test_resample_categorical_data_with_timedeltaindex(self):
# GH #12169
df = DataFrame({'Group_obj': 'A'},
index=pd.to_timedelta(list(range(20)), unit='s'))
df['Group'] = df['Group_obj'].astype('category')
result = df.resample('10s').agg(lambda x: (x.value_counts().index[0]))
expected = DataFrame({'Group_obj': ['A', 'A'],
'Group': ['A', 'A']},
index=pd.to_timedelta([0, 10], unit='s'))
expected = expected.reindex(['Group_obj', 'Group'], axis=1)
tm.assert_frame_equal(result, expected)
def test_resample_daily_anchored(self):
rng = date_range('1/1/2000 0:00:00', periods=10000, freq='T')
ts = Series(np.random.randn(len(rng)), index=rng)
ts[:2] = np.nan # so results are the same
result = ts[2:].resample('D', closed='left', label='left').mean()
expected = ts.resample('D', closed='left', label='left').mean()
assert_series_equal(result, expected)
def test_resample_to_period_monthly_buglet(self):
# GH #1259
rng = date_range('1/1/2000', '12/31/2000')
ts = Series(np.random.randn(len(rng)), index=rng)
result = ts.resample('M', kind='period').mean()
exp_index = period_range('Jan-2000', 'Dec-2000', freq='M')
tm.assert_index_equal(result.index, exp_index)
def test_period_with_agg(self):
# aggregate a period resampler with a lambda
s2 = Series(np.random.randint(0, 5, 50),
index=pd.period_range('2012-01-01', freq='H', periods=50),
dtype='float64')
expected = s2.to_timestamp().resample('D').mean().to_period()
result = s2.resample('D').agg(lambda x: x.mean())
assert_series_equal(result, expected)
def test_resample_segfault(self):
# GH 8573
# segfaulting in older versions
all_wins_and_wagers = [
(1, datetime(2013, 10, 1, 16, 20), 1, 0),
(2, datetime(2013, 10, 1, 16, 10), 1, 0),
(2, datetime(2013, 10, 1, 18, 15), 1, 0),
(2, datetime(2013, 10, 1, 16, 10, 31), 1, 0)]
df = DataFrame.from_records(all_wins_and_wagers,
columns=("ID", "timestamp", "A", "B")
).set_index("timestamp")
result = df.groupby("ID").resample("5min").sum()
expected = df.groupby("ID").apply(lambda x: x.resample("5min").sum())
assert_frame_equal(result, expected)
def test_resample_dtype_preservation(self):
# GH 12202
# validation tests for dtype preservation
df = DataFrame({'date': pd.date_range(start='2016-01-01',
periods=4, freq='W'),
'group': [1, 1, 2, 2],
'val': Series([5, 6, 7, 8],
dtype='int32')}
).set_index('date')
result = df.resample('1D').ffill()
assert result.val.dtype == np.int32
result = df.groupby('group').resample('1D').ffill()
assert result.val.dtype == np.int32
def test_resample_dtype_coercion(self):
pytest.importorskip('scipy.interpolate')
# GH 16361
df = {"a": [1, 3, 1, 4]}
df = DataFrame(df, index=pd.date_range("2017-01-01", "2017-01-04"))
expected = (df.astype("float64")
.resample("H")
.mean()
["a"]
.interpolate("cubic")
)
result = df.resample("H")["a"].mean().interpolate("cubic")
tm.assert_series_equal(result, expected)
result = df.resample("H").mean()["a"].interpolate("cubic")
tm.assert_series_equal(result, expected)
def test_weekly_resample_buglet(self):
# #1327
rng = date_range('1/1/2000', freq='B', periods=20)
ts = Series(np.random.randn(len(rng)), index=rng)
resampled = ts.resample('W').mean()
expected = ts.resample('W-SUN').mean()
assert_series_equal(resampled, expected)
def test_monthly_resample_error(self):
# #1451
dates = date_range('4/16/2012 20:00', periods=5000, freq='h')
ts = Series(np.random.randn(len(dates)), index=dates)
# it works!
ts.resample('M')
def test_nanosecond_resample_error(self):
# GH 12307 - "Values falls after last bin" error when
# resampling using pd.tseries.offsets.Nano as the period
start = 1443707890427
exp_start = 1443707890400
indx = pd.date_range(
start=pd.to_datetime(start),
periods=10,
freq='100n'
)
ts = Series(range(len(indx)), index=indx)
r = ts.resample(pd.tseries.offsets.Nano(100))
result = r.agg('mean')
exp_indx = pd.date_range(
start=pd.to_datetime(exp_start),
periods=10,
freq='100n'
)
exp = Series(range(len(exp_indx)), index=exp_indx)
assert_series_equal(result, exp)
def test_resample_anchored_intraday(self):
# #1471, #1458
rng = date_range('1/1/2012', '4/1/2012', freq='100min')
df = DataFrame(rng.month, index=rng)
result = df.resample('M').mean()
expected = df.resample(
'M', kind='period').mean().to_timestamp(how='end')
tm.assert_frame_equal(result, expected)
result = df.resample('M', closed='left').mean()
exp = df.tshift(1, freq='D').resample('M', kind='period').mean()
exp = exp.to_timestamp(how='end')
tm.assert_frame_equal(result, exp)
rng = date_range('1/1/2012', '4/1/2012', freq='100min')
df = DataFrame(rng.month, index=rng)
result = df.resample('Q').mean()
expected = df.resample(
'Q', kind='period').mean().to_timestamp(how='end')
tm.assert_frame_equal(result, expected)
result = df.resample('Q', closed='left').mean()
expected = df.tshift(1, freq='D').resample('Q', kind='period',
closed='left').mean()
expected = expected.to_timestamp(how='end')
tm.assert_frame_equal(result, expected)
ts = _simple_ts('2012-04-29 23:00', '2012-04-30 5:00', freq='h')
resampled = ts.resample('M').mean()
assert len(resampled) == 1
def test_resample_anchored_monthstart(self):
ts = _simple_ts('1/1/2000', '12/31/2002')
freqs = ['MS', 'BMS', 'QS-MAR', 'AS-DEC', 'AS-JUN']
for freq in freqs:
ts.resample(freq).mean()
def test_resample_anchored_multiday(self):
# When resampling a range spanning multiple days, ensure that the
# start date gets used to determine the offset. Fixes issue where
# a one day period is not a multiple of the frequency.
#
# See: https://github.com/pandas-dev/pandas/issues/8683
index = pd.date_range(
'2014-10-14 23:06:23.206', periods=3, freq='400L'
) | pd.date_range(
'2014-10-15 23:00:00', periods=2, freq='2200L')
s = Series(np.random.randn(5), index=index)
# Ensure left closing works
result = s.resample('2200L').mean()
assert result.index[-1] == Timestamp('2014-10-15 23:00:02.000')
# Ensure right closing works
result = s.resample('2200L', label='right').mean()
assert result.index[-1] == Timestamp('2014-10-15 23:00:04.200')
def test_corner_cases(self):
# miscellaneous test coverage
rng = date_range('1/1/2000', periods=12, freq='t')
ts = Series(np.random.randn(len(rng)), index=rng)
result = ts.resample('5t', closed='right', label='left').mean()
ex_index = date_range('1999-12-31 23:55', periods=4, freq='5t')
tm.assert_index_equal(result.index, ex_index)
len0pts = _simple_pts('2007-01', '2010-05', freq='M')[:0]
# it works
result = len0pts.resample('A-DEC').mean()
assert len(result) == 0
# resample to periods
ts = _simple_ts('2000-04-28', '2000-04-30 11:00', freq='h')
result = ts.resample('M', kind='period').mean()
assert len(result) == 1
assert result.index[0] == Period('2000-04', freq='M')
def test_anchored_lowercase_buglet(self):
dates = date_range('4/16/2012 20:00', periods=50000, freq='s')
ts = Series(np.random.randn(len(dates)), index=dates)
# it works!
ts.resample('d').mean()
def test_upsample_apply_functions(self):
# #1596
rng = pd.date_range('2012-06-12', periods=4, freq='h')
ts = Series(np.random.randn(len(rng)), index=rng)
result = ts.resample('20min').aggregate(['mean', 'sum'])
assert isinstance(result, DataFrame)
def test_resample_not_monotonic(self):
rng = pd.date_range('2012-06-12', periods=200, freq='h')
ts = Series(np.random.randn(len(rng)), index=rng)
ts = ts.take(np.random.permutation(len(ts)))
result = ts.resample('D').sum()
exp = ts.sort_index().resample('D').sum()
assert_series_equal(result, exp)
def test_resample_median_bug_1688(self):
for dtype in ['int64', 'int32', 'float64', 'float32']:
df = DataFrame([1, 2], index=[datetime(2012, 1, 1, 0, 0, 0),
datetime(2012, 1, 1, 0, 5, 0)],
dtype=dtype)
result = df.resample("T").apply(lambda x: x.mean())
exp = df.asfreq('T')
tm.assert_frame_equal(result, exp)
result = df.resample("T").median()
exp = df.asfreq('T')
tm.assert_frame_equal(result, exp)
def test_how_lambda_functions(self):
ts = _simple_ts('1/1/2000', '4/1/2000')
result = ts.resample('M').apply(lambda x: x.mean())
exp = ts.resample('M').mean()
tm.assert_series_equal(result, exp)
foo_exp = ts.resample('M').mean()
foo_exp.name = 'foo'
bar_exp = ts.resample('M').std()
bar_exp.name = 'bar'
result = ts.resample('M').apply(
[lambda x: x.mean(), lambda x: x.std(ddof=1)])
result.columns = ['foo', 'bar']
tm.assert_series_equal(result['foo'], foo_exp)
tm.assert_series_equal(result['bar'], bar_exp)
# this is a MI Series, so comparing the names of the results
# doesn't make sense
result = ts.resample('M').aggregate({'foo': lambda x: x.mean(),
'bar': lambda x: x.std(ddof=1)})
tm.assert_series_equal(result['foo'], foo_exp, check_names=False)
tm.assert_series_equal(result['bar'], bar_exp, check_names=False)
def test_resample_unequal_times(self):
# #1772
start = datetime(1999, 3, 1, 5)
# end hour is less than start
end = datetime(2012, 7, 31, 4)
bad_ind = date_range(start, end, freq="30min")
df = DataFrame({'close': 1}, index=bad_ind)
# it works!
df.resample('AS').sum()
def test_resample_consistency(self):
# GH 6418
# resample with bfill / limit / reindex consistency
i30 = pd.date_range('2002-02-02', periods=4, freq='30T')
s = Series(np.arange(4.), index=i30)
s[2] = np.NaN
# Upsample by factor 3 with reindex() and resample() methods:
i10 = pd.date_range(i30[0], i30[-1], freq='10T')
s10 = s.reindex(index=i10, method='bfill')
s10_2 = s.reindex(index=i10, method='bfill', limit=2)
rl = s.reindex_like(s10, method='bfill', limit=2)
r10_2 = s.resample('10Min').bfill(limit=2)
r10 = s.resample('10Min').bfill()
# s10_2, r10, r10_2, rl should all be equal
assert_series_equal(s10_2, r10)
assert_series_equal(s10_2, r10_2)
assert_series_equal(s10_2, rl)
def test_resample_timegrouper(self):
# GH 7227
dates1 = [datetime(2014, 10, 1), datetime(2014, 9, 3),
datetime(2014, 11, 5), datetime(2014, 9, 5),
datetime(2014, 10, 8), datetime(2014, 7, 15)]
dates2 = dates1[:2] + [pd.NaT] + dates1[2:4] + [pd.NaT] + dates1[4:]
dates3 = [pd.NaT] + dates1 + [pd.NaT]
for dates in [dates1, dates2, dates3]:
df = DataFrame(dict(A=dates, B=np.arange(len(dates))))
result = df.set_index('A').resample('M').count()
exp_idx = pd.DatetimeIndex(['2014-07-31', '2014-08-31',
'2014-09-30',
'2014-10-31', '2014-11-30'],
freq='M', name='A')
expected = DataFrame({'B': [1, 0, 2, 2, 1]}, index=exp_idx)
assert_frame_equal(result, expected)
result = df.groupby(pd.Grouper(freq='M', key='A')).count()
assert_frame_equal(result, expected)
df = DataFrame(dict(A=dates, B=np.arange(len(dates)), C=np.arange(
len(dates))))
result = df.set_index('A').resample('M').count()
expected = DataFrame({'B': [1, 0, 2, 2, 1], 'C': [1, 0, 2, 2, 1]},
index=exp_idx, columns=['B', 'C'])
assert_frame_equal(result, expected)
result = df.groupby(pd.Grouper(freq='M', key='A')).count()
assert_frame_equal(result, expected)
def test_resample_nunique(self):
# GH 12352
df = DataFrame({
'ID': {Timestamp('2015-06-05 00:00:00'): '0010100903',
Timestamp('2015-06-08 00:00:00'): '0010150847'},
'DATE': {Timestamp('2015-06-05 00:00:00'): '2015-06-05',
Timestamp('2015-06-08 00:00:00'): '2015-06-08'}})
r = df.resample('D')
g = df.groupby(pd.Grouper(freq='D'))
expected = df.groupby(pd.Grouper(freq='D')).ID.apply(lambda x:
x.nunique())
assert expected.name == 'ID'
for t in [r, g]:
result = t.ID.nunique()
assert_series_equal(result, expected)
result = df.ID.resample('D').nunique()
assert_series_equal(result, expected)
result = df.ID.groupby(pd.Grouper(freq='D')).nunique()
assert_series_equal(result, expected)
def test_resample_nunique_with_date_gap(self):
# GH 13453
index = pd.date_range('1-1-2000', '2-15-2000', freq='h')
index2 = pd.date_range('4-15-2000', '5-15-2000', freq='h')
index3 = index.append(index2)
s = Series(range(len(index3)), index=index3, dtype='int64')
r = s.resample('M')
# Since all elements are unique, these should all be the same
results = [
r.count(),
r.nunique(),
r.agg(Series.nunique),
r.agg('nunique')
]
assert_series_equal(results[0], results[1])
assert_series_equal(results[0], results[2])
assert_series_equal(results[0], results[3])
def test_resample_group_info(self): # GH10914
for n, k in product((10000, 100000), (10, 100, 1000)):
dr = date_range(start='2015-08-27', periods=n // 10, freq='T')
ts = Series(np.random.randint(0, n // k, n).astype('int64'),
index=np.random.choice(dr, n))
left = ts.resample('30T').nunique()
ix = date_range(start=ts.index.min(), end=ts.index.max(),
freq='30T')
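            # Recompute nunique per 30T bin by hand: searchsorted assigns each
            # timestamp a 1-based bin number, lexsort orders the (bin, value)
            # pairs, and the mask keeps the first occurrence of each distinct
            # pair, so bincount yields the number of unique values per bin.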
vals = ts.values
bins = np.searchsorted(ix.values, ts.index, side='right')
sorter = np.lexsort((vals, bins))
vals, bins = vals[sorter], bins[sorter]
mask = np.r_[True, vals[1:] != vals[:-1]]
mask |= np.r_[True, bins[1:] != bins[:-1]]
arr = np.bincount(bins[mask] - 1,
minlength=len(ix)).astype('int64', copy=False)
right = Series(arr, index=ix)
assert_series_equal(left, right)
def test_resample_size(self):
n = 10000
dr = date_range('2015-09-19', periods=n, freq='T')
ts = Series(np.random.randn(n), index=np.random.choice(dr, n))
left = ts.resample('7T').size()
ix = date_range(start=left.index.min(), end=ts.index.max(), freq='7T')
bins = np.searchsorted(ix.values, ts.index.values, side='right')
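        # searchsorted(..., side='right') gives 1-based bin numbers relative
        # to ix; bincount tallies the bin sizes and the [1:] slice drops the
        # unused 0 slot, reproducing resample('7T').size() by hand.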
val = np.bincount(bins, minlength=len(ix) + 1)[1:].astype('int64',
copy=False)
right = Series(val, index=ix)
assert_series_equal(left, right)
def test_resample_across_dst(self):
# The test resamples a DatetimeIndex with values before and after a
# DST change
# Issue: 14682
# The DatetimeIndex we will start with
# (note that DST happens at 03:00+02:00 -> 02:00+01:00)
# 2016-10-30 02:23:00+02:00, 2016-10-30 02:23:00+01:00
df1 = DataFrame([1477786980, 1477790580], columns=['ts'])
dti1 = DatetimeIndex(pd.to_datetime(df1.ts, unit='s')
.dt.tz_localize('UTC')
.dt.tz_convert('Europe/Madrid'))
# The expected DatetimeIndex after resampling.
# 2016-10-30 02:00:00+02:00, 2016-10-30 02:00:00+01:00
df2 = DataFrame([1477785600, 1477789200], columns=['ts'])
dti2 = DatetimeIndex(pd.to_datetime(df2.ts, unit='s')
.dt.tz_localize('UTC')
.dt.tz_convert('Europe/Madrid'))
df = DataFrame([5, 5], index=dti1)
result = df.resample(rule='H').sum()
expected = DataFrame([5, 5], index=dti2)
assert_frame_equal(result, expected)
def test_resample_dst_anchor(self):
# 5172
dti = DatetimeIndex([datetime(2012, 11, 4, 23)], tz='US/Eastern')
df = DataFrame([5], index=dti)
assert_frame_equal(df.resample(rule='D').sum(),
DataFrame([5], index=df.index.normalize()))
df.resample(rule='MS').sum()
assert_frame_equal(
df.resample(rule='MS').sum(),
DataFrame([5], index=DatetimeIndex([datetime(2012, 11, 1)],
tz='US/Eastern')))
dti = date_range('2013-09-30', '2013-11-02', freq='30Min',
tz='Europe/Paris')
values = range(dti.size)
df = DataFrame({"a": values,
"b": values,
"c": values}, index=dti, dtype='int64')
how = {"a": "min", "b": "max", "c": "count"}
assert_frame_equal(
df.resample("W-MON").agg(how)[["a", "b", "c"]],
DataFrame({"a": [0, 48, 384, 720, 1056, 1394],
"b": [47, 383, 719, 1055, 1393, 1586],
"c": [48, 336, 336, 336, 338, 193]},
index=date_range('9/30/2013', '11/4/2013',
freq='W-MON', tz='Europe/Paris')),
'W-MON Frequency')
assert_frame_equal(
df.resample("2W-MON").agg(how)[["a", "b", "c"]],
DataFrame({"a": [0, 48, 720, 1394],
"b": [47, 719, 1393, 1586],
"c": [48, 672, 674, 193]},
index=date_range('9/30/2013', '11/11/2013',
freq='2W-MON', tz='Europe/Paris')),
'2W-MON Frequency')
assert_frame_equal(
df.resample("MS").agg(how)[["a", "b", "c"]],
DataFrame({"a": [0, 48, 1538],
"b": [47, 1537, 1586],
"c": [48, 1490, 49]},
index=date_range('9/1/2013', '11/1/2013',
freq='MS', tz='Europe/Paris')),
'MS Frequency')
assert_frame_equal(
df.resample("2MS").agg(how)[["a", "b", "c"]],
DataFrame({"a": [0, 1538],
"b": [1537, 1586],
"c": [1538, 49]},
index=date_range('9/1/2013', '11/1/2013',
freq='2MS', tz='Europe/Paris')),
'2MS Frequency')
df_daily = df['10/26/2013':'10/29/2013']
assert_frame_equal(
df_daily.resample("D").agg({"a": "min", "b": "max", "c": "count"})
[["a", "b", "c"]],
DataFrame({"a": [1248, 1296, 1346, 1394],
"b": [1295, 1345, 1393, 1441],
"c": [48, 50, 48, 48]},
index=date_range('10/26/2013', '10/29/2013',
freq='D', tz='Europe/Paris')),
'D Frequency')
def test_resample_with_nat(self):
# GH 13020
index = DatetimeIndex([pd.NaT,
'1970-01-01 00:00:00',
pd.NaT,
'1970-01-01 00:00:01',
'1970-01-01 00:00:02'])
frame = DataFrame([2, 3, 5, 7, 11], index=index)
index_1s = DatetimeIndex(['1970-01-01 00:00:00',
'1970-01-01 00:00:01',
'1970-01-01 00:00:02'])
frame_1s = DataFrame([3, 7, 11], index=index_1s)
assert_frame_equal(frame.resample('1s').mean(), frame_1s)
index_2s = DatetimeIndex(['1970-01-01 00:00:00',
'1970-01-01 00:00:02'])
frame_2s = DataFrame([5, 11], index=index_2s)
assert_frame_equal(frame.resample('2s').mean(), frame_2s)
index_3s = DatetimeIndex(['1970-01-01 00:00:00'])
frame_3s = DataFrame([7], index=index_3s)
assert_frame_equal(frame.resample('3s').mean(), frame_3s)
assert_frame_equal(frame.resample('60s').mean(), frame_3s)
def test_resample_timedelta_values(self):
# GH 13119
# check that timedelta dtype is preserved when NaT values are
# introduced by the resampling
times = timedelta_range('1 day', '4 day', freq='4D')
df = DataFrame({'time': times}, index=times)
times2 = timedelta_range('1 day', '4 day', freq='2D')
exp = Series(times2, index=times2, name='time')
exp.iloc[1] = pd.NaT
res = df.resample('2D').first()['time']
tm.assert_series_equal(res, exp)
res = df['time'].resample('2D').first()
tm.assert_series_equal(res, exp)
def test_resample_datetime_values(self):
# GH 13119
# check that datetime dtype is preserved when NaT values are
# introduced by the resampling
dates = [datetime(2016, 1, 15), datetime(2016, 1, 19)]
df = DataFrame({'timestamp': dates}, index=dates)
exp = Series([datetime(2016, 1, 15), pd.NaT, datetime(2016, 1, 19)],
index=date_range('2016-01-15', periods=3, freq='2D'),
name='timestamp')
res = df.resample('2D').first()['timestamp']
tm.assert_series_equal(res, exp)
res = df['timestamp'].resample('2D').first()
tm.assert_series_equal(res, exp)
class TestPeriodIndex(Base):
_index_factory = lambda x: period_range
@pytest.fixture
def _series_name(self):
return 'pi'
def create_series(self):
# TODO: replace calls to .create_series() by injecting the series
# fixture
i = period_range(datetime(2005, 1, 1),
datetime(2005, 1, 10), freq='D')
return Series(np.arange(len(i)), index=i, name='pi')
@pytest.mark.parametrize('freq', ['2D', '1H', '2H'])
@pytest.mark.parametrize('kind', ['period', None, 'timestamp'])
def test_asfreq(self, series_and_frame, freq, kind):
# GH 12884, 15944
# make sure .asfreq() returns PeriodIndex (except kind='timestamp')
obj = series_and_frame
if kind == 'timestamp':
expected = obj.to_timestamp().resample(freq).asfreq()
else:
start = obj.index[0].to_timestamp(how='start')
end = (obj.index[-1] + 1).to_timestamp(how='start')
new_index = date_range(start=start, end=end, freq=freq,
closed='left')
expected = obj.to_timestamp().reindex(new_index).to_period(freq)
result = obj.resample(freq, kind=kind).asfreq()
assert_almost_equal(result, expected)
def test_asfreq_fill_value(self):
# test for fill value during resampling, issue 3715
s = self.create_series()
new_index = date_range(s.index[0].to_timestamp(how='start'),
(s.index[-1]).to_timestamp(how='start'),
freq='1H')
expected = s.to_timestamp().reindex(new_index, fill_value=4.0)
result = s.resample('1H', kind='timestamp').asfreq(fill_value=4.0)
assert_series_equal(result, expected)
frame = s.to_frame('value')
new_index = date_range(frame.index[0].to_timestamp(how='start'),
(frame.index[-1]).to_timestamp(how='start'),
freq='1H')
expected = frame.to_timestamp().reindex(new_index, fill_value=3.0)
result = frame.resample('1H', kind='timestamp').asfreq(fill_value=3.0)
assert_frame_equal(result, expected)
@pytest.mark.parametrize('freq', ['H', '12H', '2D', 'W'])
@pytest.mark.parametrize('kind', [None, 'period', 'timestamp'])
def test_selection(self, index, freq, kind):
        # GH 14008
        # Selection via 'on' or 'level' is not yet implemented for
        # PeriodIndex resampling, so these calls should raise for now
rng = np.arange(len(index), dtype=np.int64)
df = DataFrame({'date': index, 'a': rng},
index=pd.MultiIndex.from_arrays([rng, index],
names=['v', 'd']))
with pytest.raises(NotImplementedError):
df.resample(freq, on='date', kind=kind)
with pytest.raises(NotImplementedError):
df.resample(freq, level='d', kind=kind)
def test_annual_upsample_D_s_f(self):
self._check_annual_upsample_cases('D', 'start', 'ffill')
def test_annual_upsample_D_e_f(self):
self._check_annual_upsample_cases('D', 'end', 'ffill')
def test_annual_upsample_D_s_b(self):
self._check_annual_upsample_cases('D', 'start', 'bfill')
def test_annual_upsample_D_e_b(self):
self._check_annual_upsample_cases('D', 'end', 'bfill')
def test_annual_upsample_B_s_f(self):
self._check_annual_upsample_cases('B', 'start', 'ffill')
def test_annual_upsample_B_e_f(self):
self._check_annual_upsample_cases('B', 'end', 'ffill')
def test_annual_upsample_B_s_b(self):
self._check_annual_upsample_cases('B', 'start', 'bfill')
def test_annual_upsample_B_e_b(self):
self._check_annual_upsample_cases('B', 'end', 'bfill')
def test_annual_upsample_M_s_f(self):
self._check_annual_upsample_cases('M', 'start', 'ffill')
def test_annual_upsample_M_e_f(self):
self._check_annual_upsample_cases('M', 'end', 'ffill')
def test_annual_upsample_M_s_b(self):
self._check_annual_upsample_cases('M', 'start', 'bfill')
def test_annual_upsample_M_e_b(self):
self._check_annual_upsample_cases('M', 'end', 'bfill')
def _check_annual_upsample_cases(self, targ, conv, meth, end='12/31/1991'):
for month in MONTHS:
ts = _simple_pts('1/1/1990', end, freq='A-%s' % month)
result = getattr(ts.resample(targ, convention=conv), meth)()
expected = result.to_timestamp(targ, how=conv)
expected = expected.asfreq(targ, meth).to_period()
assert_series_equal(result, expected)
def test_basic_downsample(self):
ts = _simple_pts('1/1/1990', '6/30/1995', freq='M')
result = ts.resample('a-dec').mean()
expected = ts.groupby(ts.index.year).mean()
expected.index = period_range('1/1/1990', '6/30/1995', freq='a-dec')
assert_series_equal(result, expected)
# this is ok
assert_series_equal(ts.resample('a-dec').mean(), result)
assert_series_equal(ts.resample('a').mean(), result)
def test_not_subperiod(self):
# These are incompatible period rules for resampling
ts = _simple_pts('1/1/1990', '6/30/1995', freq='w-wed')
pytest.raises(ValueError, lambda: ts.resample('a-dec').mean())
pytest.raises(ValueError, lambda: ts.resample('q-mar').mean())
pytest.raises(ValueError, lambda: ts.resample('M').mean())
pytest.raises(ValueError, lambda: ts.resample('w-thu').mean())
@pytest.mark.parametrize('freq', ['D', '2D'])
def test_basic_upsample(self, freq):
ts = _simple_pts('1/1/1990', '6/30/1995', freq='M')
result = ts.resample('a-dec').mean()
resampled = result.resample(freq, convention='end').ffill()
expected = result.to_timestamp(freq, how='end')
expected = expected.asfreq(freq, 'ffill').to_period(freq)
assert_series_equal(resampled, expected)
def test_upsample_with_limit(self):
rng = period_range('1/1/2000', periods=5, freq='A')
ts = Series(np.random.randn(len(rng)), rng)
result = ts.resample('M', convention='end').ffill(limit=2)
expected = ts.asfreq('M').reindex(result.index, method='ffill',
limit=2)
assert_series_equal(result, expected)
def test_annual_upsample(self):
ts = _simple_pts('1/1/1990', '12/31/1995', freq='A-DEC')
df = DataFrame({'a': ts})
rdf = df.resample('D').ffill()
exp = df['a'].resample('D').ffill()
assert_series_equal(rdf['a'], exp)
rng = period_range('2000', '2003', freq='A-DEC')
ts = Series([1, 2, 3, 4], index=rng)
result = ts.resample('M').ffill()
ex_index = period_range('2000-01', '2003-12', freq='M')
expected = ts.asfreq('M', how='start').reindex(ex_index,
method='ffill')
assert_series_equal(result, expected)
def test_quarterly_upsample(self):
targets = ['D', 'B', 'M']
for month in MONTHS:
ts = _simple_pts('1/1/1990', '12/31/1995', freq='Q-%s' % month)
for targ, conv in product(targets, ['start', 'end']):
result = ts.resample(targ, convention=conv).ffill()
expected = result.to_timestamp(targ, how=conv)
expected = expected.asfreq(targ, 'ffill').to_period()
assert_series_equal(result, expected)
def test_monthly_upsample(self):
targets = ['D', 'B']
ts = _simple_pts('1/1/1990', '12/31/1995', freq='M')
for targ, conv in product(targets, ['start', 'end']):
result = ts.resample(targ, convention=conv).ffill()
expected = result.to_timestamp(targ, how=conv)
expected = expected.asfreq(targ, 'ffill').to_period()
assert_series_equal(result, expected)
def test_resample_basic(self):
# GH3609
s = Series(range(100), index=date_range(
'20130101', freq='s', periods=100, name='idx'), dtype='float')
s[10:30] = np.nan
index = PeriodIndex([
Period('2013-01-01 00:00', 'T'),
Period('2013-01-01 00:01', 'T')], name='idx')
expected = Series([34.5, 79.5], index=index)
result = s.to_period().resample('T', kind='period').mean()
assert_series_equal(result, expected)
result2 = s.resample('T', kind='period').mean()
assert_series_equal(result2, expected)
@pytest.mark.parametrize('freq,expected_vals', [('M', [31, 29, 31, 9]),
('2M', [31 + 29, 31 + 9])])
def test_resample_count(self, freq, expected_vals):
# GH12774
series = Series(1, index=pd.period_range(start='2000', periods=100))
result = series.resample(freq).count()
expected_index = pd.period_range(start='2000', freq=freq,
periods=len(expected_vals))
expected = Series(expected_vals, index=expected_index)
assert_series_equal(result, expected)
def test_resample_same_freq(self):
# GH12770
series = Series(range(3), index=pd.period_range(
start='2000', periods=3, freq='M'))
expected = series
for method in resample_methods:
result = getattr(series.resample('M'), method)()
assert_series_equal(result, expected)
def test_resample_incompat_freq(self):
with pytest.raises(IncompatibleFrequency):
Series(range(3), index=pd.period_range(
start='2000', periods=3, freq='M')).resample('W').mean()
def test_with_local_timezone_pytz(self):
# see gh-5430
local_timezone = pytz.timezone('America/Los_Angeles')
start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
tzinfo=pytz.utc)
# 1 day later
end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
tzinfo=pytz.utc)
index = pd.date_range(start, end, freq='H')
series = Series(1, index=index)
series = series.tz_convert(local_timezone)
result = series.resample('D', kind='period').mean()
# Create the expected series
# Index is moved back a day with the timezone conversion from UTC to
# Pacific
expected_index = (pd.period_range(start=start, end=end, freq='D') - 1)
expected = Series(1, index=expected_index)
assert_series_equal(result, expected)
def test_resample_with_pytz(self):
# GH 13238
s = Series(2, index=pd.date_range('2017-01-01', periods=48, freq="H",
tz="US/Eastern"))
result = s.resample("D").mean()
expected = Series(2, index=pd.DatetimeIndex(['2017-01-01',
'2017-01-02'],
tz="US/Eastern"))
assert_series_equal(result, expected)
# Especially assert that the timezone is LMT for pytz
assert result.index.tz == pytz.timezone('US/Eastern')
def test_with_local_timezone_dateutil(self):
# see gh-5430
local_timezone = 'dateutil/America/Los_Angeles'
start = datetime(year=2013, month=11, day=1, hour=0, minute=0,
tzinfo=dateutil.tz.tzutc())
# 1 day later
end = datetime(year=2013, month=11, day=2, hour=0, minute=0,
tzinfo=dateutil.tz.tzutc())
index = pd.date_range(start, end, freq='H', name='idx')
series = Series(1, index=index)
series = series.tz_convert(local_timezone)
result = series.resample('D', kind='period').mean()
# Create the expected series
# Index is moved back a day with the timezone conversion from UTC to
# Pacific
expected_index = (pd.period_range(start=start, end=end, freq='D',
name='idx') - 1)
expected = Series(1, index=expected_index)
assert_series_equal(result, expected)
def test_fill_method_and_how_upsample(self):
# GH2073
s = Series(np.arange(9, dtype='int64'),
index=date_range('2010-01-01', periods=9, freq='Q'))
last = s.resample('M').ffill()
both = s.resample('M').ffill().resample('M').last().astype('int64')
assert_series_equal(last, both)
def test_weekly_upsample(self):
targets = ['D', 'B']
for day in DAYS:
ts = _simple_pts('1/1/1990', '12/31/1995', freq='W-%s' % day)
for targ, conv in product(targets, ['start', 'end']):
result = ts.resample(targ, convention=conv).ffill()
expected = result.to_timestamp(targ, how=conv)
expected = expected.asfreq(targ, 'ffill').to_period()
assert_series_equal(result, expected)
def test_resample_to_timestamps(self):
ts = _simple_pts('1/1/1990', '12/31/1995', freq='M')
result = ts.resample('A-DEC', kind='timestamp').mean()
expected = ts.to_timestamp(how='end').resample('A-DEC').mean()
assert_series_equal(result, expected)
def test_resample_to_quarterly(self):
for month in MONTHS:
ts = _simple_pts('1990', '1992', freq='A-%s' % month)
quar_ts = ts.resample('Q-%s' % month).ffill()
stamps = ts.to_timestamp('D', how='start')
qdates = period_range(ts.index[0].asfreq('D', 'start'),
ts.index[-1].asfreq('D', 'end'),
freq='Q-%s' % month)
expected = stamps.reindex(qdates.to_timestamp('D', 's'),
method='ffill')
expected.index = qdates
assert_series_equal(quar_ts, expected)
# conforms, but different month
ts = _simple_pts('1990', '1992', freq='A-JUN')
for how in ['start', 'end']:
result = ts.resample('Q-MAR', convention=how).ffill()
expected = ts.asfreq('Q-MAR', how=how)
expected = expected.reindex(result.index, method='ffill')
# .to_timestamp('D')
# expected = expected.resample('Q-MAR').ffill()
assert_series_equal(result, expected)
def test_resample_fill_missing(self):
rng = PeriodIndex([2000, 2005, 2007, 2009], freq='A')
s = Series(np.random.randn(4), index=rng)
stamps = s.to_timestamp()
filled = s.resample('A').ffill()
expected = stamps.resample('A').ffill().to_period('A')
assert_series_equal(filled, expected)
def test_cant_fill_missing_dups(self):
rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq='A')
s = Series(np.random.randn(5), index=rng)
pytest.raises(Exception, lambda: s.resample('A').ffill())
@pytest.mark.parametrize('freq', ['5min'])
@pytest.mark.parametrize('kind', ['period', None, 'timestamp'])
def test_resample_5minute(self, freq, kind):
rng = period_range('1/1/2000', '1/5/2000', freq='T')
ts = Series(np.random.randn(len(rng)), index=rng)
expected = ts.to_timestamp().resample(freq).mean()
if kind != 'timestamp':
expected = expected.to_period(freq)
result = ts.resample(freq, kind=kind).mean()
assert_series_equal(result, expected)
def test_upsample_daily_business_daily(self):
ts = _simple_pts('1/1/2000', '2/1/2000', freq='B')
result = ts.resample('D').asfreq()
expected = ts.asfreq('D').reindex(period_range('1/3/2000', '2/1/2000'))
assert_series_equal(result, expected)
ts = _simple_pts('1/1/2000', '2/1/2000')
result = ts.resample('H', convention='s').asfreq()
exp_rng = period_range('1/1/2000', '2/1/2000 23:00', freq='H')
expected = ts.asfreq('H', how='s').reindex(exp_rng)
assert_series_equal(result, expected)
def test_resample_irregular_sparse(self):
dr = date_range(start='1/1/2012', freq='5min', periods=1000)
s = Series(np.array(100), index=dr)
# subset the data.
subset = s[:'2012-01-04 06:55']
result = subset.resample('10min').apply(len)
expected = s.resample('10min').apply(len).loc[result.index]
assert_series_equal(result, expected)
def test_resample_weekly_all_na(self):
rng = date_range('1/1/2000', periods=10, freq='W-WED')
ts = Series(np.random.randn(len(rng)), index=rng)
result = ts.resample('W-THU').asfreq()
assert result.isna().all()
result = ts.resample('W-THU').asfreq().ffill()[:-1]
expected = ts.asfreq('W-THU').ffill()
assert_series_equal(result, expected)
def test_resample_tz_localized(self):
dr = date_range(start='2012-4-13', end='2012-5-1')
ts = Series(lrange(len(dr)), dr)
ts_utc = ts.tz_localize('UTC')
ts_local = ts_utc.tz_convert('America/Los_Angeles')
result = ts_local.resample('W').mean()
ts_local_naive = ts_local.copy()
ts_local_naive.index = [x.replace(tzinfo=None)
for x in ts_local_naive.index.to_pydatetime()]
exp = ts_local_naive.resample(
'W').mean().tz_localize('America/Los_Angeles')
assert_series_equal(result, exp)
# it works
result = ts_local.resample('D').mean()
# #2245
idx = date_range('2001-09-20 15:59', '2001-09-20 16:00', freq='T',
tz='Australia/Sydney')
s = Series([1, 2], index=idx)
result = s.resample('D', closed='right', label='right').mean()
ex_index = date_range('2001-09-21', periods=1, freq='D',
tz='Australia/Sydney')
expected = Series([1.5], index=ex_index)
assert_series_equal(result, expected)
# for good measure
result = s.resample('D', kind='period').mean()
ex_index = period_range('2001-09-20', periods=1, freq='D')
expected = Series([1.5], index=ex_index)
assert_series_equal(result, expected)
# GH 6397
# comparing an offset that doesn't propagate tz's
rng = date_range('1/1/2011', periods=20000, freq='H')
rng = rng.tz_localize('EST')
ts = DataFrame(index=rng)
ts['first'] = np.random.randn(len(rng))
ts['second'] = np.cumsum(np.random.randn(len(rng)))
expected = DataFrame(
{
'first': ts.resample('A').sum()['first'],
'second': ts.resample('A').mean()['second']},
columns=['first', 'second'])
result = ts.resample(
'A').agg({'first': np.sum,
'second': np.mean}).reindex(columns=['first', 'second'])
assert_frame_equal(result, expected)
def test_closed_left_corner(self):
# #1465
s = Series(np.random.randn(21),
index=date_range(start='1/1/2012 9:30',
freq='1min', periods=21))
s[0] = np.nan
result = s.resample('10min', closed='left', label='right').mean()
exp = s[1:].resample('10min', closed='left', label='right').mean()
assert_series_equal(result, exp)
result = s.resample('10min', closed='left', label='left').mean()
exp = s[1:].resample('10min', closed='left', label='left').mean()
ex_index = date_range(start='1/1/2012 9:30', freq='10min', periods=3)
tm.assert_index_equal(result.index, ex_index)
assert_series_equal(result, exp)
def test_quarterly_resampling(self):
rng = period_range('2000Q1', periods=10, freq='Q-DEC')
ts = Series(np.arange(10), index=rng)
result = ts.resample('A').mean()
exp = ts.to_timestamp().resample('A').mean().to_period()
assert_series_equal(result, exp)
def test_resample_weekly_bug_1726(self):
# 8/6/12 is a Monday
ind = DatetimeIndex(start="8/6/2012", end="8/26/2012", freq="D")
n = len(ind)
data = [[x] * 5 for x in range(n)]
df = DataFrame(data, columns=['open', 'high', 'low', 'close', 'vol'],
index=ind)
# it works!
df.resample('W-MON', closed='left', label='left').first()
def test_resample_with_dst_time_change(self):
# GH 15549
index = pd.DatetimeIndex([1457537600000000000, 1458059600000000000],
tz='UTC').tz_convert('America/Chicago')
df = pd.DataFrame([1, 2], index=index)
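        # The two epoch-nanosecond stamps fall on 2016-03-09 and 2016-03-15,
        # straddling the America/Chicago spring-forward change on 2016-03-13,
        # so the forward-filled 12h bins switch from -06:00 to -05:00 offsets.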
result = df.resample('12h', closed='right',
label='right').last().ffill()
expected_index_values = ['2016-03-09 12:00:00-06:00',
'2016-03-10 00:00:00-06:00',
'2016-03-10 12:00:00-06:00',
'2016-03-11 00:00:00-06:00',
'2016-03-11 12:00:00-06:00',
'2016-03-12 00:00:00-06:00',
'2016-03-12 12:00:00-06:00',
'2016-03-13 00:00:00-06:00',
'2016-03-13 13:00:00-05:00',
'2016-03-14 01:00:00-05:00',
'2016-03-14 13:00:00-05:00',
'2016-03-15 01:00:00-05:00',
'2016-03-15 13:00:00-05:00']
index = pd.DatetimeIndex(expected_index_values,
tz='UTC').tz_convert('America/Chicago')
expected = pd.DataFrame([1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 2.0], index=index)
assert_frame_equal(result, expected)
def test_resample_bms_2752(self):
# GH2753
foo = Series(index=pd.bdate_range('20000101', '20000201'))
res1 = foo.resample("BMS").mean()
res2 = foo.resample("BMS").mean().resample("B").mean()
assert res1.index[0] == Timestamp('20000103')
assert res1.index[0] == res2.index[0]
# def test_monthly_convention_span(self):
# rng = period_range('2000-01', periods=3, freq='M')
# ts = Series(np.arange(3), index=rng)
# # hacky way to get same thing
# exp_index = period_range('2000-01-01', '2000-03-31', freq='D')
# expected = ts.asfreq('D', how='end').reindex(exp_index)
# expected = expected.fillna(method='bfill')
# result = ts.resample('D', convention='span').mean()
# assert_series_equal(result, expected)
def test_default_right_closed_label(self):
end_freq = ['D', 'Q', 'M', 'D']
end_types = ['M', 'A', 'Q', 'W']
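        # resampling to an end-anchored frequency ('M', 'A', 'Q', 'W') should
        # default to closed='right', label='right'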
for from_freq, to_freq in zip(end_freq, end_types):
idx = DatetimeIndex(start='8/15/2012', periods=100, freq=from_freq)
df = DataFrame(np.random.randn(len(idx), 2), idx)
resampled = df.resample(to_freq).mean()
assert_frame_equal(resampled, df.resample(to_freq, closed='right',
label='right').mean())
def test_default_left_closed_label(self):
others = ['MS', 'AS', 'QS', 'D', 'H']
others_freq = ['D', 'Q', 'M', 'H', 'T']
for from_freq, to_freq in zip(others_freq, others):
idx = DatetimeIndex(start='8/15/2012', periods=100, freq=from_freq)
df = DataFrame(np.random.randn(len(idx), 2), idx)
resampled = df.resample(to_freq).mean()
assert_frame_equal(resampled, df.resample(to_freq, closed='left',
label='left').mean())
def test_all_values_single_bin(self):
# 2070
index = period_range(start="2012-01-01", end="2012-12-31", freq="M")
s = Series(np.random.randn(len(index)), index=index)
result = s.resample("A").mean()
tm.assert_almost_equal(result[0], s.mean())
def test_evenly_divisible_with_no_extra_bins(self):
# 4076
        # when the frequency evenly divides the index, no extra bins
        # should be created
df = DataFrame(np.random.randn(9, 3),
index=date_range('2000-1-1', periods=9))
result = df.resample('5D').mean()
expected = pd.concat(
[df.iloc[0:5].mean(), df.iloc[5:].mean()], axis=1).T
expected.index = [Timestamp('2000-1-1'), Timestamp('2000-1-6')]
assert_frame_equal(result, expected)
index = date_range(start='2001-5-4', periods=28)
df = DataFrame(
[{'REST_KEY': 1, 'DLY_TRN_QT': 80, 'DLY_SLS_AMT': 90,
'COOP_DLY_TRN_QT': 30, 'COOP_DLY_SLS_AMT': 20}] * 28 +
[{'REST_KEY': 2, 'DLY_TRN_QT': 70, 'DLY_SLS_AMT': 10,
'COOP_DLY_TRN_QT': 50, 'COOP_DLY_SLS_AMT': 20}] * 28,
index=index.append(index)).sort_index()
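        # index.append(index).sort_index() yields two rows per day, so each
        # 7D bin holds 14 rows: count() gives 14 per column and sum() gives
        # 7 * (value for REST_KEY 1 + value for REST_KEY 2) per column.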
index = date_range('2001-5-4', periods=4, freq='7D')
expected = DataFrame(
[{'REST_KEY': 14, 'DLY_TRN_QT': 14, 'DLY_SLS_AMT': 14,
'COOP_DLY_TRN_QT': 14, 'COOP_DLY_SLS_AMT': 14}] * 4,
index=index)
result = df.resample('7D').count()
assert_frame_equal(result, expected)
expected = DataFrame(
[{'REST_KEY': 21, 'DLY_TRN_QT': 1050, 'DLY_SLS_AMT': 700,
'COOP_DLY_TRN_QT': 560, 'COOP_DLY_SLS_AMT': 280}] * 4,
index=index)
result = df.resample('7D').sum()
assert_frame_equal(result, expected)
@pytest.mark.parametrize('kind', ['period', None, 'timestamp'])
@pytest.mark.parametrize('agg_arg', ['mean', {'value': 'mean'}, ['mean']])
def test_loffset_returns_datetimeindex(self, frame, kind, agg_arg):
# make sure passing loffset returns DatetimeIndex in all cases
# basic method taken from Base.test_resample_loffset_arg_type()
df = frame
expected_means = [df.values[i:i + 2].mean()
for i in range(0, len(df.values), 2)]
expected_index = self.create_index(df.index[0],
                                           periods=len(df.index) // 2,
freq='2D')
        # loffset coerces PeriodIndex to DatetimeIndex
expected_index = expected_index.to_timestamp()
expected_index += timedelta(hours=2)
expected = DataFrame({'value': expected_means}, index=expected_index)
result_agg = df.resample('2D', loffset='2H', kind=kind).agg(agg_arg)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result_how = df.resample('2D', how=agg_arg, loffset='2H',
kind=kind)
if isinstance(agg_arg, list):
expected.columns = pd.MultiIndex.from_tuples([('value', 'mean')])
assert_frame_equal(result_agg, expected)
assert_frame_equal(result_how, expected)
@pytest.mark.parametrize('freq, period_mult', [('H', 24), ('12H', 2)])
@pytest.mark.parametrize('kind', [None, 'period'])
def test_upsampling_ohlc(self, freq, period_mult, kind):
# GH 13083
pi = PeriodIndex(start='2000', freq='D', periods=10)
s = Series(range(len(pi)), index=pi)
expected = s.to_timestamp().resample(freq).ohlc().to_period(freq)
# timestamp-based resampling doesn't include all sub-periods
# of the last original period, so extend accordingly:
new_index = PeriodIndex(start='2000', freq=freq,
periods=period_mult * len(pi))
expected = expected.reindex(new_index)
result = s.resample(freq, kind=kind).ohlc()
assert_frame_equal(result, expected)
@pytest.mark.parametrize('periods, values',
[([pd.NaT, '1970-01-01 00:00:00', pd.NaT,
'1970-01-01 00:00:02', '1970-01-01 00:00:03'],
[2, 3, 5, 7, 11]),
([pd.NaT, pd.NaT, '1970-01-01 00:00:00', pd.NaT,
pd.NaT, pd.NaT, '1970-01-01 00:00:02',
'1970-01-01 00:00:03', pd.NaT, pd.NaT],
[1, 2, 3, 5, 6, 8, 7, 11, 12, 13])])
@pytest.mark.parametrize('freq, expected_values',
[('1s', [3, np.NaN, 7, 11]),
('2s', [3, int((7 + 11) / 2)]),
('3s', [int((3 + 7) / 2), 11])])
def test_resample_with_nat(self, periods, values, freq, expected_values):
# GH 13224
index = PeriodIndex(periods, freq='S')
frame = DataFrame(values, index=index)
expected_index = period_range('1970-01-01 00:00:00',
periods=len(expected_values), freq=freq)
expected = DataFrame(expected_values, index=expected_index)
result = frame.resample(freq).mean()
assert_frame_equal(result, expected)
def test_resample_with_only_nat(self):
# GH 13224
pi = PeriodIndex([pd.NaT] * 3, freq='S')
frame = DataFrame([2, 3, 5], index=pi)
expected_index = PeriodIndex(data=[], freq=pi.freq)
expected = DataFrame([], index=expected_index)
result = frame.resample('1s').mean()
assert_frame_equal(result, expected)
class TestTimedeltaIndex(Base):
_index_factory = lambda x: timedelta_range
@pytest.fixture
def _index_start(self):
return '1 day'
@pytest.fixture
def _index_end(self):
return '10 day'
@pytest.fixture
def _series_name(self):
return 'tdi'
def create_series(self):
i = timedelta_range('1 day',
'10 day', freq='D')
return Series(np.arange(len(i)), index=i, name='tdi')
def test_asfreq_bug(self):
import datetime as dt
df = DataFrame(data=[1, 3],
index=[dt.timedelta(), dt.timedelta(minutes=3)])
result = df.resample('1T').asfreq()
expected = DataFrame(data=[1, np.nan, np.nan, 3],
index=timedelta_range('0 day',
periods=4,
freq='1T'))
assert_frame_equal(result, expected)
class TestResamplerGrouper(object):
def setup_method(self, method):
self.frame = DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8,
'B': np.arange(40)},
index=date_range('1/1/2000',
freq='s',
periods=40))
def test_tab_complete_ipython6_warning(self, ip):
from IPython.core.completer import provisionalcompleter
code = dedent("""\
import pandas.util.testing as tm
s = tm.makeTimeSeries()
rs = s.resample("D")
""")
ip.run_code(code)
with tm.assert_produces_warning(None):
with provisionalcompleter('ignore'):
list(ip.Completer.completions('rs.', 1))
def test_deferred_with_groupby(self):
# GH 12486
# support deferred resample ops with groupby
data = [['2010-01-01', 'A', 2], ['2010-01-02', 'A', 3],
['2010-01-05', 'A', 8], ['2010-01-10', 'A', 7],
['2010-01-13', 'A', 3], ['2010-01-01', 'B', 5],
['2010-01-03', 'B', 2], ['2010-01-04', 'B', 1],
['2010-01-11', 'B', 7], ['2010-01-14', 'B', 3]]
df = DataFrame(data, columns=['date', 'id', 'score'])
df.date = pd.to_datetime(df.date)
f = lambda x: x.set_index('date').resample('D').asfreq()
expected = df.groupby('id').apply(f)
result = df.set_index('date').groupby('id').resample('D').asfreq()
assert_frame_equal(result, expected)
df = DataFrame({'date': pd.date_range(start='2016-01-01',
periods=4,
freq='W'),
'group': [1, 1, 2, 2],
'val': [5, 6, 7, 8]}).set_index('date')
f = lambda x: x.resample('1D').ffill()
expected = df.groupby('group').apply(f)
result = df.groupby('group').resample('1D').ffill()
assert_frame_equal(result, expected)
def test_getitem(self):
g = self.frame.groupby('A')
expected = g.B.apply(lambda x: x.resample('2s').mean())
result = g.resample('2s').B.mean()
assert_series_equal(result, expected)
result = g.B.resample('2s').mean()
assert_series_equal(result, expected)
result = g.resample('2s').mean().B
assert_series_equal(result, expected)
def test_getitem_multiple(self):
# GH 13174
        # multiple calls after selection caused an aliasing issue
data = [{'id': 1, 'buyer': 'A'}, {'id': 2, 'buyer': 'B'}]
df = DataFrame(data, index=pd.date_range('2016-01-01', periods=2))
r = df.groupby('id').resample('1D')
result = r['buyer'].count()
expected = Series([1, 1],
index=pd.MultiIndex.from_tuples(
[(1, Timestamp('2016-01-01')),
(2, Timestamp('2016-01-02'))],
names=['id', None]),
name='buyer')
assert_series_equal(result, expected)
result = r['buyer'].count()
assert_series_equal(result, expected)
def test_groupby_resample_on_api_with_getitem(self):
# GH 17813
df = pd.DataFrame({'id': list('aabbb'),
'date': pd.date_range('1-1-2016', periods=5),
'data': 1})
exp = df.set_index('date').groupby('id').resample('2D')['data'].sum()
result = df.groupby('id').resample('2D', on='date')['data'].sum()
assert_series_equal(result, exp)
def test_nearest(self):
# GH 17496
# Resample nearest
index = pd.date_range('1/1/2000', periods=3, freq='T')
result = Series(range(3), index=index).resample('20s').nearest()
expected = Series(
[0, 0, 1, 1, 1, 2, 2],
index=pd.DatetimeIndex(
['2000-01-01 00:00:00', '2000-01-01 00:00:20',
'2000-01-01 00:00:40', '2000-01-01 00:01:00',
'2000-01-01 00:01:20', '2000-01-01 00:01:40',
'2000-01-01 00:02:00'],
dtype='datetime64[ns]',
freq='20S'))
assert_series_equal(result, expected)
def test_methods(self):
g = self.frame.groupby('A')
r = g.resample('2s')
for f in ['first', 'last', 'median', 'sem', 'sum', 'mean',
'min', 'max']:
result = getattr(r, f)()
expected = g.apply(lambda x: getattr(x.resample('2s'), f)())
assert_frame_equal(result, expected)
for f in ['size']:
result = getattr(r, f)()
expected = g.apply(lambda x: getattr(x.resample('2s'), f)())
assert_series_equal(result, expected)
for f in ['count']:
result = getattr(r, f)()
expected = g.apply(lambda x: getattr(x.resample('2s'), f)())
assert_frame_equal(result, expected)
# series only
for f in ['nunique']:
result = getattr(r.B, f)()
expected = g.B.apply(lambda x: getattr(x.resample('2s'), f)())
assert_series_equal(result, expected)
for f in ['nearest', 'backfill', 'ffill', 'asfreq']:
result = getattr(r, f)()
expected = g.apply(lambda x: getattr(x.resample('2s'), f)())
assert_frame_equal(result, expected)
result = r.ohlc()
expected = g.apply(lambda x: x.resample('2s').ohlc())
assert_frame_equal(result, expected)
for f in ['std', 'var']:
result = getattr(r, f)(ddof=1)
expected = g.apply(lambda x: getattr(x.resample('2s'), f)(ddof=1))
assert_frame_equal(result, expected)
def test_apply(self):
g = self.frame.groupby('A')
r = g.resample('2s')
# reduction
expected = g.resample('2s').sum()
def f(x):
return x.resample('2s').sum()
result = r.apply(f)
assert_frame_equal(result, expected)
def f(x):
return x.resample('2s').apply(lambda y: y.sum())
result = g.apply(f)
assert_frame_equal(result, expected)
def test_apply_with_mutated_index(self):
# GH 15169
index = pd.date_range('1-1-2015', '12-31-15', freq='D')
df = DataFrame(data={'col1': np.random.rand(len(index))}, index=index)
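        # f ignores its input and returns a Series with a brand-new index,
        # so apply() has to cope with the mutated index; resample and
        # groupby(Grouper) should handle it identically.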
def f(x):
s = Series([1, 2], index=['a', 'b'])
return s
expected = df.groupby(pd.Grouper(freq='M')).apply(f)
result = df.resample('M').apply(f)
assert_frame_equal(result, expected)
# A case for series
expected = df['col1'].groupby(pd.Grouper(freq='M')).apply(f)
result = df['col1'].resample('M').apply(f)
assert_series_equal(result, expected)
def test_resample_groupby_with_label(self):
# GH 13235
index = date_range('2000-01-01', freq='2D', periods=5)
df = DataFrame(index=index,
data={'col0': [0, 0, 1, 1, 2], 'col1': [1, 1, 1, 1, 1]}
)
result = df.groupby('col0').resample('1W', label='left').sum()
mi = [np.array([0, 0, 1, 2]),
pd.to_datetime(np.array(['1999-12-26', '2000-01-02',
'2000-01-02', '2000-01-02'])
)
]
mindex = pd.MultiIndex.from_arrays(mi, names=['col0', None])
expected = DataFrame(data={'col0': [0, 0, 2, 2], 'col1': [1, 1, 2, 1]},
index=mindex
)
assert_frame_equal(result, expected)
def test_consistency_with_window(self):
# consistent return values with window
df = self.frame
expected = pd.Int64Index([1, 2, 3], name='A')
result = df.groupby('A').resample('2s').mean()
assert result.index.nlevels == 2
tm.assert_index_equal(result.index.levels[0], expected)
result = df.groupby('A').rolling(20).mean()
assert result.index.nlevels == 2
tm.assert_index_equal(result.index.levels[0], expected)
def test_median_duplicate_columns(self):
# GH 14233
df = DataFrame(np.random.randn(20, 3),
columns=list('aaa'),
index=pd.date_range('2012-01-01', periods=20, freq='s'))
df2 = df.copy()
df2.columns = ['a', 'b', 'c']
expected = df2.resample('5s').median()
result = df.resample('5s').median()
expected.columns = result.columns
assert_frame_equal(result, expected)
class TestTimeGrouper(object):
def setup_method(self, method):
self.ts = Series(np.random.randn(1000),
index=date_range('1/1/2000', periods=1000))
def test_apply(self):
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
grouper = pd.TimeGrouper(freq='A', label='right', closed='right')
grouped = self.ts.groupby(grouper)
f = lambda x: x.sort_values()[-3:]
applied = grouped.apply(f)
expected = self.ts.groupby(lambda x: x.year).apply(f)
applied.index = applied.index.droplevel(0)
expected.index = expected.index.droplevel(0)
assert_series_equal(applied, expected)
def test_count(self):
self.ts[::3] = np.nan
expected = self.ts.groupby(lambda x: x.year).count()
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
grouper = pd.TimeGrouper(freq='A', label='right', closed='right')
result = self.ts.groupby(grouper).count()
expected.index = result.index
assert_series_equal(result, expected)
result = self.ts.resample('A').count()
expected.index = result.index
assert_series_equal(result, expected)
def test_numpy_reduction(self):
result = self.ts.resample('A', closed='right').prod()
expected = self.ts.groupby(lambda x: x.year).agg(np.prod)
expected.index = result.index
assert_series_equal(result, expected)
def test_apply_iteration(self):
# #2300
N = 1000
ind = pd.date_range(start="2000-01-01", freq="D", periods=N)
df = DataFrame({'open': 1, 'close': 2}, index=ind)
tg = TimeGrouper('M')
_, grouper, _ = tg._get_grouper(df)
# Errors
grouped = df.groupby(grouper, group_keys=False)
f = lambda df: df['close'] / df['open']
# it works!
result = grouped.apply(f)
tm.assert_index_equal(result.index, df.index)
def test_panel_aggregation(self):
ind = pd.date_range('1/1/2000', periods=100)
data = np.random.randn(2, len(ind), 4)
with catch_warnings(record=True):
wp = Panel(data, items=['Item1', 'Item2'], major_axis=ind,
minor_axis=['A', 'B', 'C', 'D'])
tg = TimeGrouper('M', axis=1)
_, grouper, _ = tg._get_grouper(wp)
bingrouped = wp.groupby(grouper)
binagg = bingrouped.mean()
def f(x):
assert (isinstance(x, Panel))
return x.mean(1)
result = bingrouped.agg(f)
tm.assert_panel_equal(result, binagg)
def test_fails_on_no_datetime_index(self):
index_names = ('Int64Index', 'Index', 'Float64Index', 'MultiIndex')
index_funcs = (tm.makeIntIndex,
tm.makeUnicodeIndex, tm.makeFloatIndex,
lambda m: tm.makeCustomIndex(m, 2))
n = 2
for name, func in zip(index_names, index_funcs):
index = func(n)
df = DataFrame({'a': np.random.randn(n)}, index=index)
with tm.assert_raises_regex(TypeError,
"Only valid with "
"DatetimeIndex, TimedeltaIndex "
"or PeriodIndex, but got an "
"instance of %r" % name):
df.groupby(TimeGrouper('D'))
def test_aaa_group_order(self):
# GH 12840
        # check that TimeGrouper performs stable sorts
n = 20
data = np.random.randn(n, 4)
df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2),
datetime(2013, 1, 3), datetime(2013, 1, 4),
datetime(2013, 1, 5)] * 4
grouped = df.groupby(TimeGrouper(key='key', freq='D'))
tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)),
df[::5])
tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 2)),
df[1::5])
tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 3)),
df[2::5])
tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 4)),
df[3::5])
tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)),
df[4::5])
def test_aggregate_normal(self):
        # check TimeGrouper's aggregation is identical to a normal groupby
n = 20
data = np.random.randn(n, 4)
normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
normal_df['key'] = [1, 2, 3, 4, 5] * 4
dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2),
datetime(2013, 1, 3), datetime(2013, 1, 4),
datetime(2013, 1, 5)] * 4
normal_grouped = normal_df.groupby('key')
dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))
for func in ['min', 'max', 'prod', 'var', 'std', 'mean']:
expected = getattr(normal_grouped, func)()
dt_result = getattr(dt_grouped, func)()
expected.index = date_range(start='2013-01-01', freq='D',
periods=5, name='key')
assert_frame_equal(expected, dt_result)
for func in ['count', 'sum']:
expected = getattr(normal_grouped, func)()
expected.index = date_range(start='2013-01-01', freq='D',
periods=5, name='key')
dt_result = getattr(dt_grouped, func)()
assert_frame_equal(expected, dt_result)
# GH 7453
for func in ['size']:
expected = getattr(normal_grouped, func)()
expected.index = date_range(start='2013-01-01', freq='D',
periods=5, name='key')
dt_result = getattr(dt_grouped, func)()
assert_series_equal(expected, dt_result)
# GH 7453
for func in ['first', 'last']:
expected = getattr(normal_grouped, func)()
expected.index = date_range(start='2013-01-01', freq='D',
periods=5, name='key')
dt_result = getattr(dt_grouped, func)()
assert_frame_equal(expected, dt_result)
        # if a TimeGrouper is used, 'nth' doesn't work yet
"""
for func in ['nth']:
expected = getattr(normal_grouped, func)(3)
expected.index = date_range(start='2013-01-01',
freq='D', periods=5, name='key')
dt_result = getattr(dt_grouped, func)(3)
assert_frame_equal(expected, dt_result)
"""
@pytest.mark.parametrize('method, unit', [
('sum', 0),
('prod', 1),
])
    def test_resample_entirely_nat_window(self, method, unit):
s = pd.Series([0] * 2 + [np.nan] * 2,
index=pd.date_range('2017', periods=4))
# 0 / 1 by default
result = methodcaller(method)(s.resample("2d"))
expected = pd.Series([0.0, unit],
index=pd.to_datetime(['2017-01-01',
'2017-01-03']))
tm.assert_series_equal(result, expected)
# min_count=0
result = methodcaller(method, min_count=0)(s.resample("2d"))
expected = pd.Series([0.0, unit],
index=pd.to_datetime(['2017-01-01',
'2017-01-03']))
tm.assert_series_equal(result, expected)
# min_count=1
result = methodcaller(method, min_count=1)(s.resample("2d"))
expected = pd.Series([0.0, np.nan],
index=pd.to_datetime(['2017-01-01',
'2017-01-03']))
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize('func, fill_value', [
('min', np.nan),
('max', np.nan),
('sum', 0),
('prod', 1),
('count', 0),
])
def test_aggregate_with_nat(self, func, fill_value):
        # check TimeGrouper's aggregation is identical to a normal groupby;
        # if NaT is included, 'var', 'std', 'mean', 'first', 'last'
        # and 'nth' don't work yet
n = 20
data = np.random.randn(n, 4).astype('int64')
normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
normal_df['key'] = [1, 2, np.nan, 4, 5] * 4
dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4
normal_grouped = normal_df.groupby('key')
dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))
normal_result = getattr(normal_grouped, func)()
dt_result = getattr(dt_grouped, func)()
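        # rows whose key is NaT fall into no group, so the TimeGrouper result
        # has only four groups; pad the numeric-groupby result with a
        # fill_value row so the missing day (2013-01-03) lines up.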
pad = DataFrame([[fill_value] * 4], index=[3],
columns=['A', 'B', 'C', 'D'])
expected = normal_result.append(pad)
expected = expected.sort_index()
expected.index = date_range(start='2013-01-01', freq='D',
periods=5, name='key')
assert_frame_equal(expected, dt_result)
assert dt_result.index.name == 'key'
def test_aggregate_with_nat_size(self):
# GH 9925
n = 20
data = np.random.randn(n, 4).astype('int64')
normal_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
normal_df['key'] = [1, 2, np.nan, 4, 5] * 4
dt_df = DataFrame(data, columns=['A', 'B', 'C', 'D'])
dt_df['key'] = [datetime(2013, 1, 1), datetime(2013, 1, 2), pd.NaT,
datetime(2013, 1, 4), datetime(2013, 1, 5)] * 4
normal_grouped = normal_df.groupby('key')
dt_grouped = dt_df.groupby(TimeGrouper(key='key', freq='D'))
normal_result = normal_grouped.size()
dt_result = dt_grouped.size()
pad = Series([0], index=[3])
expected = normal_result.append(pad)
expected = expected.sort_index()
expected.index = date_range(start='2013-01-01', freq='D',
periods=5, name='key')
assert_series_equal(expected, dt_result)
assert dt_result.index.name == 'key'
def test_repr(self):
# GH18203
result = repr(TimeGrouper(key='A', freq='H'))
expected = ("TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, "
"closed='left', label='left', how='mean', "
"convention='e', base=0)")
assert result == expected
@pytest.mark.parametrize('method, unit', [
('sum', 0),
('prod', 1),
])
def test_upsample_sum(self, method, unit):
s = pd.Series(1, index=pd.date_range("2017", periods=2, freq="H"))
resampled = s.resample("30T")
index = pd.to_datetime(['2017-01-01T00:00:00',
'2017-01-01T00:30:00',
'2017-01-01T01:00:00'])
# 0 / 1 by default
result = methodcaller(method)(resampled)
expected = pd.Series([1, unit, 1], index=index)
tm.assert_series_equal(result, expected)
# min_count=0
result = methodcaller(method, min_count=0)(resampled)
expected = pd.Series([1, unit, 1], index=index)
tm.assert_series_equal(result, expected)
# min_count=1
result = methodcaller(method, min_count=1)(resampled)
expected = pd.Series([1, np.nan, 1], index=index)
tm.assert_series_equal(result, expected)
# min_count>1
result = methodcaller(method, min_count=2)(resampled)
expected = pd.Series([np.nan, np.nan, np.nan], index=index)
tm.assert_series_equal(result, expected)