""" test with the TimeGrouper / grouping with datetimes """
|
||
|
|
||
|
import pytest
|
||
|
import pytz
|
||
|
|
||
|
from datetime import datetime
|
||
|
import numpy as np
|
||
|
from numpy import nan
|
||
|
|
||
|
import pandas as pd
|
||
|
from pandas import (DataFrame, date_range, Index,
|
||
|
Series, MultiIndex, Timestamp, DatetimeIndex)
|
||
|
from pandas.compat import StringIO
|
||
|
from pandas.util import testing as tm
|
||
|
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
||
|
|
||
|
|
||
|
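# NOTE: these tests target an older pandas API surface: pd.TimeGrouper is
# already deprecated here (exercised under a FutureWarning below) and
# pandas.compat supplies the py2/py3 StringIO shim.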
class TestGroupBy(object):

    def test_groupby_with_timegrouper(self):
        # GH 4161
        # TimeGrouper requires a sorted index
        # also verifies that the resultant index has the correct name
        df_original = DataFrame({
            'Buyer': 'Carl Carl Carl Carl Joe Carl'.split(),
            'Quantity': [18, 3, 5, 1, 9, 3],
            'Date': [
                datetime(2013, 9, 1, 13, 0),
                datetime(2013, 9, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 3, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 9, 2, 14, 0),
            ]
        })

        # GH 6908 change target column's order
        df_reordered = df_original.sort_values(by='Quantity')

        for df in [df_original, df_reordered]:
            df = df.set_index(['Date'])

            expected = DataFrame(
                {'Quantity': 0},
                index=date_range('20130901 13:00:00',
                                 '20131205 13:00:00', freq='5D',
                                 name='Date', closed='left'))
            expected.iloc[[0, 6, 18], 0] = np.array([24, 6, 9], dtype='int64')

            result1 = df.resample('5D').sum()
            assert_frame_equal(result1, expected)

            df_sorted = df.sort_index()
            result2 = df_sorted.groupby(pd.Grouper(freq='5D')).sum()
            assert_frame_equal(result2, expected)

            result3 = df.groupby(pd.Grouper(freq='5D')).sum()
            assert_frame_equal(result3, expected)

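    # The next test checks the groupby machinery itself (grouper type and
    # the .groups mapping) rather than aggregated values.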
@pytest.mark.parametrize("should_sort", [True, False])
|
||
|
def test_groupby_with_timegrouper_methods(self, should_sort):
|
||
|
# GH 3881
|
||
|
# make sure API of timegrouper conforms
|
||
|
|
||
|
df = pd.DataFrame({
|
||
|
'Branch': 'A A A A A B'.split(),
|
||
|
'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
|
||
|
'Quantity': [1, 3, 5, 8, 9, 3],
|
||
|
'Date': [
|
||
|
datetime(2013, 1, 1, 13, 0),
|
||
|
datetime(2013, 1, 1, 13, 5),
|
||
|
datetime(2013, 10, 1, 20, 0),
|
||
|
datetime(2013, 10, 2, 10, 0),
|
||
|
datetime(2013, 12, 2, 12, 0),
|
||
|
datetime(2013, 12, 2, 14, 0),
|
||
|
]
|
||
|
})
|
||
|
|
||
|
if should_sort:
|
||
|
df = df.sort_values(by='Quantity', ascending=False)
|
||
|
|
||
|
df = df.set_index('Date', drop=False)
|
||
|
g = df.groupby(pd.Grouper(freq='6M'))
|
||
|
assert g.group_keys
|
||
|
|
||
|
import pandas.core.groupby.groupby
|
||
|
assert isinstance(g.grouper, pandas.core.groupby.groupby.BinGrouper)
|
||
|
groups = g.groups
|
||
|
assert isinstance(groups, dict)
|
||
|
assert len(groups) == 3
|
||
|
|
||
|
    def test_timegrouper_with_reg_groups(self):

        # GH 3794
        # allow combination of timegrouper/reg groups

        df_original = DataFrame({
            'Branch': 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 1, 1, 13, 0),
                datetime(2013, 1, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 12, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)

        for df in [df_original, df_sorted]:
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame({
                'Buyer': 'Carl Mark Carl Joe'.split(),
                'Quantity': [1, 3, 9, 18],
                'Date': [
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

        df_original = DataFrame({
            'Branch': 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 10, 1, 13, 0),
                datetime(2013, 10, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 2, 12, 0),
                datetime(2013, 10, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)
        for df in [df_original, df_sorted]:

            expected = DataFrame({
                'Buyer': 'Carl Joe Mark Carl Joe'.split(),
                'Quantity': [6, 8, 3, 4, 10],
                'Date': [
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum()
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # passing the name
            df = df.reset_index()
            result = df.groupby([pd.Grouper(freq='1M', key='Date'),
                                 'Buyer']).sum()
            assert_frame_equal(result, expected)

            with pytest.raises(KeyError):
                df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum()

            # passing the level
            df = df.set_index('Date')
            result = df.groupby([pd.Grouper(freq='1M', level='Date'),
                                 'Buyer']).sum()
            assert_frame_equal(result, expected)
            result = df.groupby([pd.Grouper(freq='1M', level=0),
                                 'Buyer']).sum()
            assert_frame_equal(result, expected)

            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq='1M', level='foo'),
                            'Buyer']).sum()

            # multi names
            df = df.copy()
            df['Date'] = df.index + pd.offsets.MonthEnd(2)
            result = df.groupby([pd.Grouper(freq='1M', key='Date'),
                                 'Buyer']).sum()
            expected = DataFrame({
                'Buyer': 'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # error as we have both a level and a name!
            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq='1M', key='Date',
                                       level='Date'), 'Buyer']).sum()

            # single groupers
            expected = DataFrame({'Quantity': [31],
                                  'Date': [datetime(2013, 10, 31, 0, 0)
                                           ]}).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M')]).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame({'Quantity': [31],
                                  'Date': [datetime(2013, 11, 30, 0, 0)
                                           ]}).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum()
            assert_frame_equal(result, expected)

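    # Grouping by [Grouper(freq), column] should agree with the equivalent
    # groupby/resample chain, whatever the input sort order.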
    @pytest.mark.parametrize('freq', ['D', 'M', 'A', 'Q-APR'])
    def test_timegrouper_with_reg_groups_freq(self, freq):
        # GH 6764 multiple grouping with/without sort
        df = DataFrame({
            'date': pd.to_datetime([
                '20121002', '20121007', '20130130', '20130202', '20130305',
                '20121002', '20121207', '20130130', '20130202', '20130305',
                '20130202', '20130305'
            ]),
            'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                           359, 801],
            'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
        }).set_index('date')

        expected = (
            df.groupby('user_id')['whole_cost']
            .resample(freq)
            .sum(min_count=1)  # XXX
            .dropna()
            .reorder_levels(['date', 'user_id'])
            .sort_index()
            .astype('int64')
        )
        expected.name = 'whole_cost'

        result1 = df.sort_index().groupby([pd.Grouper(freq=freq),
                                           'user_id'])['whole_cost'].sum()
        assert_series_equal(result1, expected)

        result2 = df.groupby([pd.Grouper(freq=freq),
                              'user_id'])['whole_cost'].sum()
        assert_series_equal(result2, expected)

    def test_timegrouper_get_group(self):
        # GH 6914

        df_original = DataFrame({
            'Buyer': 'Carl Joe Joe Carl Joe Carl'.split(),
            'Quantity': [18, 3, 5, 1, 9, 3],
            'Date': [datetime(2013, 9, 1, 13, 0),
                     datetime(2013, 9, 1, 13, 5),
                     datetime(2013, 10, 1, 20, 0),
                     datetime(2013, 10, 3, 10, 0),
                     datetime(2013, 12, 2, 12, 0),
                     datetime(2013, 9, 2, 14, 0)]
        })
        df_reordered = df_original.sort_values(by='Quantity')

        # single grouping
        expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]
        dt_list = ['2013-09-30', '2013-10-31', '2013-12-31']

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M', key='Date'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

        # multiple grouping
        expected_list = [df_original.iloc[[1]], df_original.iloc[[3]],
                         df_original.iloc[[4]]]
        g_list = [('Joe', '2013-09-30'), ('Carl', '2013-10-31'),
                  ('Joe', '2013-12-31')]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(['Buyer', pd.Grouper(freq='M', key='Date')])
            for (b, t), expected in zip(g_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group((b, dt))
                assert_frame_equal(result, expected)

        # with index
        df_original = df_original.set_index('Date')
        df_reordered = df_original.sort_values(by='Quantity')

        expected_list = [df_original.iloc[[0, 1, 5]], df_original.iloc[[2, 3]],
                         df_original.iloc[[4]]]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(pd.Grouper(freq='M'))
            for t, expected in zip(dt_list, expected_list):
                dt = pd.Timestamp(t)
                result = grouped.get_group(dt)
                assert_frame_equal(result, expected)

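    # The two apply/return-type tests below pin GH 11742: TimeGrouper.apply
    # must match Grouper.apply whether the UDF returns a Series or a scalar.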
    def test_timegrouper_apply_return_type_series(self):
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'],
                           'value': [10, 13]})
        df_dt = df.copy()
        df_dt['date'] = pd.to_datetime(df_dt['date'])

        def sumfunc_series(x):
            return pd.Series([x['value'].sum()], ('sum',))

        expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_series)
        result = (df_dt.groupby(pd.Grouper(freq='M', key='date'))
                  .apply(sumfunc_series))
        assert_frame_equal(result.reset_index(drop=True),
                           expected.reset_index(drop=True))

    def test_timegrouper_apply_return_type_value(self):
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = pd.DataFrame({'date': ['10/10/2000', '11/10/2000'],
                           'value': [10, 13]})
        df_dt = df.copy()
        df_dt['date'] = pd.to_datetime(df_dt['date'])

        def sumfunc_value(x):
            return x.value.sum()

        expected = df.groupby(pd.Grouper(key='date')).apply(sumfunc_value)
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            result = (df_dt.groupby(pd.TimeGrouper(freq='M', key='date'))
                      .apply(sumfunc_value))
        assert_series_equal(result.reset_index(drop=True),
                            expected.reset_index(drop=True))

    def test_groupby_groups_datetimeindex(self):
        # GH 1430
        periods = 1000
        ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods)
        df = DataFrame({'high': np.arange(periods),
                        'low': np.arange(periods)}, index=ind)
        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

        # it works!
        groups = grouped.groups
        assert isinstance(list(groups.keys())[0], datetime)

        # GH 11442
        index = pd.date_range('2015/01/01', periods=5, name='date')
        df = pd.DataFrame({'A': [5, 6, 7, 8, 9],
                           'B': [1, 2, 3, 4, 5]}, index=index)
        result = df.groupby(level='date').groups
        dates = ['2015-01-05', '2015-01-04', '2015-01-03',
                 '2015-01-02', '2015-01-01']
        expected = {pd.Timestamp(date): pd.DatetimeIndex([date], name='date')
                    for date in dates}
        tm.assert_dict_equal(result, expected)

        grouped = df.groupby(level='date')
        for date in dates:
            result = grouped.get_group(date)
            data = [[df.loc[date, 'A'], df.loc[date, 'B']]]
            expected_index = pd.DatetimeIndex([date], name='date')
            expected = pd.DataFrame(data,
                                    columns=list('AB'),
                                    index=expected_index)
            tm.assert_frame_equal(result, expected)

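    # Grouping on tz-aware datetimes (as a column or as an index level)
    # must preserve the timezone in the result index.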
    def test_groupby_groups_datetimeindex_tz(self):
        # GH 3950
        dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                 '2011-07-19 09:00:00', '2011-07-19 07:00:00',
                 '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'datetime': dates,
                        'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2] * 3})
        df['datetime'] = df['datetime'].apply(
            lambda d: Timestamp(d, tz='US/Pacific'))

        exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00',
                                     '2011-07-19 07:00:00',
                                     '2011-07-19 08:00:00',
                                     '2011-07-19 08:00:00',
                                     '2011-07-19 09:00:00',
                                     '2011-07-19 09:00:00'],
                                    tz='US/Pacific', name='datetime')
        exp_idx2 = Index(['a', 'b'] * 3, name='label')
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5],
                              'value2': [1, 2, 2, 1, 1, 2]},
                             index=exp_idx, columns=['value1', 'value2'])

        result = df.groupby(['datetime', 'label']).sum()
        assert_frame_equal(result, expected)

        # by level
        didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo')
        df = DataFrame({'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2, 3, 1, 2, 3]},
                       index=didx)

        exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00',
                                    '2011-07-19 08:00:00',
                                    '2011-07-19 09:00:00'], tz='Asia/Tokyo')
        expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
                             index=exp_idx, columns=['value1', 'value2'])

        result = df.groupby(level=0).sum()
        assert_frame_equal(result, expected)

    def test_frame_datetime64_handling_groupby(self):
        # it works!
        df = DataFrame([(3, np.datetime64('2012-07-03')),
                        (3, np.datetime64('2012-07-04'))],
                       columns=['a', 'date'])
        result = df.groupby('a').first()
        assert result['date'][3] == Timestamp('2012-07-03')

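    # Localizing per group keeps each group's own timezone; the combined
    # result is therefore an object-dtype Series of Timestamps.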
    def test_groupby_multi_timezone(self):

        # combining multiple / different timezones yields UTC

        data = """0,2000-01-28 16:47:00,America/Chicago
1,2000-01-29 16:48:00,America/Chicago
2,2000-01-30 16:49:00,America/Los_Angeles
3,2000-01-31 16:50:00,America/Chicago
4,2000-01-01 16:50:00,America/New_York"""

        df = pd.read_csv(StringIO(data), header=None,
                         names=['value', 'date', 'tz'])
        result = df.groupby('tz').date.apply(
            lambda x: pd.to_datetime(x).dt.tz_localize(x.name))

        expected = Series([Timestamp('2000-01-28 16:47:00-0600',
                                     tz='America/Chicago'),
                           Timestamp('2000-01-29 16:48:00-0600',
                                     tz='America/Chicago'),
                           Timestamp('2000-01-30 16:49:00-0800',
                                     tz='America/Los_Angeles'),
                           Timestamp('2000-01-31 16:50:00-0600',
                                     tz='America/Chicago'),
                           Timestamp('2000-01-01 16:50:00-0500',
                                     tz='America/New_York')],
                          name='date',
                          dtype=object)
        assert_series_equal(result, expected)

        tz = 'America/Chicago'
        res_values = df.groupby('tz').date.get_group(tz)
        result = pd.to_datetime(res_values).dt.tz_localize(tz)
        exp_values = Series(['2000-01-28 16:47:00', '2000-01-29 16:48:00',
                             '2000-01-31 16:50:00'],
                            index=[0, 1, 3], name='date')
        expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
        assert_series_equal(result, expected)

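    # Mirrors the tz-aware DatetimeIndex test above, using Periods instead.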
    def test_groupby_groups_periods(self):
        dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                 '2011-07-19 09:00:00', '2011-07-19 07:00:00',
                 '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'period': [pd.Period(d, freq='H') for d in dates],
                        'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2] * 3})

        exp_idx1 = pd.PeriodIndex(['2011-07-19 07:00:00',
                                   '2011-07-19 07:00:00',
                                   '2011-07-19 08:00:00',
                                   '2011-07-19 08:00:00',
                                   '2011-07-19 09:00:00',
                                   '2011-07-19 09:00:00'],
                                  freq='H', name='period')
        exp_idx2 = Index(['a', 'b'] * 3, name='label')
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5],
                              'value2': [1, 2, 2, 1, 1, 2]},
                             index=exp_idx, columns=['value1', 'value2'])

        result = df.groupby(['period', 'label']).sum()
        assert_frame_equal(result, expected)

        # by level
        didx = pd.PeriodIndex(dates, freq='H')
        df = DataFrame({'value1': np.arange(6, dtype='int64'),
                        'value2': [1, 2, 3, 1, 2, 3]},
                       index=didx)

        exp_idx = pd.PeriodIndex(['2011-07-19 07:00:00',
                                  '2011-07-19 08:00:00',
                                  '2011-07-19 09:00:00'], freq='H')
        expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
                             index=exp_idx, columns=['value1', 'value2'])

        result = df.groupby(level=0).sum()
        assert_frame_equal(result, expected)

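    # datetime64 values must survive groupby().first() with their dtype
    # intact, for both the DataFrame and Series paths.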
    def test_groupby_first_datetime64(self):
        df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
        df[1] = df[1].view('M8[ns]')

        assert issubclass(df[1].dtype.type, np.datetime64)

        result = df.groupby(level=0).first()
        got_dt = result[1].dtype
        assert issubclass(got_dt.type, np.datetime64)

        result = df[1].groupby(level=0).first()
        got_dt = result.dtype
        assert issubclass(got_dt.type, np.datetime64)

    def test_groupby_max_datetime64(self):
        # GH 5869
        # datetimelike dtype conversion from int
        df = DataFrame(dict(A=Timestamp('20130101'), B=np.arange(5)))
        expected = df.groupby('A')['A'].apply(lambda x: x.max())
        result = df.groupby('A')['A'].max()
        assert_series_equal(result, expected)

    def test_groupby_datetime64_32_bit(self):
        # GH 6410 / numpy 4328
        # 32-bit under 1.9-dev indexing issue

        df = DataFrame({"A": range(2), "B": [pd.Timestamp('2000-01-1')] * 2})
        result = df.groupby("A")["B"].transform(min)
        expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B')
        assert_series_equal(result, expected)

    def test_groupby_with_timezone_selection(self):
        # GH 11616
        # Test that column selection returns output in correct timezone.
        np.random.seed(42)
        df = pd.DataFrame({
            'factor': np.random.randint(0, 3, size=60),
            'time': pd.date_range('01/01/2000 00:00', periods=60,
                                  freq='s', tz='UTC')
        })
        df1 = df.groupby('factor').max()['time']
        df2 = df.groupby('factor')['time'].max()
        tm.assert_series_equal(df1, df2)

    def test_timezone_info(self):
        # see gh-11682: Timezone info lost when broadcasting
        # scalar datetime to DataFrame

        df = pd.DataFrame({'a': [1], 'b': [datetime.now(pytz.utc)]})
        assert df['b'][0].tzinfo == pytz.utc
        df = pd.DataFrame({'a': [1, 2, 3]})
        df['b'] = datetime.now(pytz.utc)
        assert df['b'][0].tzinfo == pytz.utc

    def test_datetime_count(self):
        df = DataFrame({'a': [1, 2, 3] * 2,
                        'dates': pd.date_range('now', periods=6, freq='T')})
        result = df.groupby('a').dates.count()
        expected = Series([2, 2, 2],
                          index=Index([1, 2, 3], name='a'), name='dates')
        tm.assert_series_equal(result, expected)

    def test_first_last_max_min_on_time_data(self):
        # GH 10295
        # Verify that NaT is not in the result of max, min, first and last on
        # Dataframe with datetime or timedelta values.
        from datetime import timedelta as td
        df_test = DataFrame(
            {'dt': [nan, '2015-07-24 10:10', '2015-07-25 11:11',
                    '2015-07-23 12:12', nan],
             'td': [nan, td(days=1), td(days=2), td(days=3), nan]})
        df_test.dt = pd.to_datetime(df_test.dt)
        df_test['group'] = 'A'
        df_ref = df_test[df_test.dt.notna()]

        grouped_test = df_test.groupby('group')
        grouped_ref = df_ref.groupby('group')

        assert_frame_equal(grouped_ref.max(), grouped_test.max())
        assert_frame_equal(grouped_ref.min(), grouped_test.min())
        assert_frame_equal(grouped_ref.first(), grouped_test.first())
        assert_frame_equal(grouped_ref.last(), grouped_test.last())

    def test_nunique_with_timegrouper_and_nat(self):
        # GH 17575
        test = pd.DataFrame({
            'time': [Timestamp('2016-06-28 09:35:35'),
                     pd.NaT,
                     Timestamp('2016-06-28 16:46:28')],
            'data': ['1', '2', '3']})

        grouper = pd.Grouper(key='time', freq='h')
        result = test.groupby(grouper)['data'].nunique()
        expected = test[test.time.notnull()].groupby(grouper)['data'].nunique()
        tm.assert_series_equal(result, expected)

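    # A scalar Grouper and a one-element list of Groupers should behave
    # identically (GH 17530).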
    def test_scalar_call_versus_list_call(self):
        # Issue: 17530
        data_frame = {
            'location': ['shanghai', 'beijing', 'shanghai'],
            'time': pd.Series(['2017-08-09 13:32:23', '2017-08-11 23:23:15',
                               '2017-08-11 22:23:15'],
                              dtype='datetime64[ns]'),
            'value': [1, 2, 3]
        }
        data_frame = pd.DataFrame(data_frame).set_index('time')
        grouper = pd.Grouper(freq='D')

        grouped = data_frame.groupby(grouper)
        result = grouped.count()
        grouped = data_frame.groupby([grouper])
        expected = grouped.count()

        assert_frame_equal(result, expected)