288 lines
9.3 KiB
Python
288 lines
9.3 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
test .agg behavior / note that .apply is tested generally in test_groupby.py
|
|
"""
|
|
|
|
import pytest
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
from pandas import concat, DataFrame, Index, MultiIndex, Series
|
|
from pandas.core.groupby.groupby import Grouping, SpecificationError
|
|
from pandas.compat import OrderedDict
|
|
import pandas.util.testing as tm
|
|
|
|
|
|
def test_agg_regression1(tsframe):
|
|
grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
|
|
result = grouped.agg(np.mean)
|
|
expected = grouped.mean()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_agg_must_agg(df):
|
|
grouped = df.groupby('A')['C']
|
|
|
|
msg = "Must produce aggregated value"
|
|
with tm.assert_raises_regex(Exception, msg):
|
|
grouped.agg(lambda x: x.describe())
|
|
with tm.assert_raises_regex(Exception, msg):
|
|
grouped.agg(lambda x: x.index[:2])
|
|
|
|
|
|
def test_agg_ser_multi_key(df):
|
|
# TODO(wesm): unused
|
|
ser = df.C # noqa
|
|
|
|
f = lambda x: x.sum()
|
|
results = df.C.groupby([df.A, df.B]).aggregate(f)
|
|
expected = df.groupby(['A', 'B']).sum()['C']
|
|
tm.assert_series_equal(results, expected)
|
|
|
|
|
|
def test_groupby_aggregation_mixed_dtype():
|
|
|
|
# GH 6212
|
|
expected = DataFrame({
|
|
'v1': [5, 5, 7, np.nan, 3, 3, 4, 1],
|
|
'v2': [55, 55, 77, np.nan, 33, 33, 44, 11]},
|
|
index=MultiIndex.from_tuples([(1, 95), (1, 99), (2, 95), (2, 99),
|
|
('big', 'damp'),
|
|
('blue', 'dry'),
|
|
('red', 'red'), ('red', 'wet')],
|
|
names=['by1', 'by2']))
|
|
|
|
df = DataFrame({
|
|
'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
|
|
'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
|
|
'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan,
|
|
12],
|
|
'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99,
|
|
np.nan, np.nan]
|
|
})
|
|
|
|
g = df.groupby(['by1', 'by2'])
|
|
result = g[['v1', 'v2']].mean()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_agg_apply_corner(ts, tsframe):
|
|
# nothing to group, all NA
|
|
grouped = ts.groupby(ts * np.nan)
|
|
assert ts.dtype == np.float64
|
|
|
|
# groupby float64 values results in Float64Index
|
|
exp = Series([], dtype=np.float64,
|
|
index=pd.Index([], dtype=np.float64))
|
|
tm.assert_series_equal(grouped.sum(), exp)
|
|
tm.assert_series_equal(grouped.agg(np.sum), exp)
|
|
tm.assert_series_equal(grouped.apply(np.sum), exp,
|
|
check_index_type=False)
|
|
|
|
# DataFrame
|
|
grouped = tsframe.groupby(tsframe['A'] * np.nan)
|
|
exp_df = DataFrame(columns=tsframe.columns, dtype=float,
|
|
index=pd.Index([], dtype=np.float64))
|
|
tm.assert_frame_equal(grouped.sum(), exp_df, check_names=False)
|
|
tm.assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False)
|
|
tm.assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0],
|
|
check_names=False)
|
|
|
|
|
|
def test_agg_grouping_is_list_tuple(ts):
|
|
df = tm.makeTimeDataFrame()
|
|
|
|
grouped = df.groupby(lambda x: x.year)
|
|
grouper = grouped.grouper.groupings[0].grouper
|
|
grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper))
|
|
|
|
result = grouped.agg(np.mean)
|
|
expected = grouped.mean()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper))
|
|
|
|
result = grouped.agg(np.mean)
|
|
expected = grouped.mean()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_agg_python_multiindex(mframe):
|
|
grouped = mframe.groupby(['A', 'B'])
|
|
|
|
result = grouped.agg(np.mean)
|
|
expected = grouped.mean()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize('groupbyfunc', [
|
|
lambda x: x.weekday(),
|
|
[lambda x: x.month, lambda x: x.weekday()],
|
|
])
|
|
def test_aggregate_str_func(tsframe, groupbyfunc):
|
|
grouped = tsframe.groupby(groupbyfunc)
|
|
|
|
# single series
|
|
result = grouped['A'].agg('std')
|
|
expected = grouped['A'].std()
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# group frame by function name
|
|
result = grouped.aggregate('var')
|
|
expected = grouped.var()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# group frame by function dict
|
|
result = grouped.agg(OrderedDict([['A', 'var'],
|
|
['B', 'std'],
|
|
['C', 'mean'],
|
|
['D', 'sem']]))
|
|
expected = DataFrame(OrderedDict([['A', grouped['A'].var()],
|
|
['B', grouped['B'].std()],
|
|
['C', grouped['C'].mean()],
|
|
['D', grouped['D'].sem()]]))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_aggregate_item_by_item(df):
|
|
grouped = df.groupby('A')
|
|
|
|
aggfun = lambda ser: ser.size
|
|
result = grouped.agg(aggfun)
|
|
foo = (df.A == 'foo').sum()
|
|
bar = (df.A == 'bar').sum()
|
|
K = len(result.columns)
|
|
|
|
# GH5782
|
|
# odd comparisons can result here, so cast to make easy
|
|
exp = pd.Series(np.array([foo] * K), index=list('BCD'),
|
|
dtype=np.float64, name='foo')
|
|
tm.assert_series_equal(result.xs('foo'), exp)
|
|
|
|
exp = pd.Series(np.array([bar] * K), index=list('BCD'),
|
|
dtype=np.float64, name='bar')
|
|
tm.assert_almost_equal(result.xs('bar'), exp)
|
|
|
|
def aggfun(ser):
|
|
return ser.size
|
|
|
|
result = DataFrame().groupby(df.A).agg(aggfun)
|
|
assert isinstance(result, DataFrame)
|
|
assert len(result) == 0
|
|
|
|
|
|
def test_wrap_agg_out(three_group):
|
|
grouped = three_group.groupby(['A', 'B'])
|
|
|
|
def func(ser):
|
|
if ser.dtype == np.object:
|
|
raise TypeError
|
|
else:
|
|
return ser.sum()
|
|
|
|
result = grouped.aggregate(func)
|
|
exp_grouped = three_group.loc[:, three_group.columns != 'C']
|
|
expected = exp_grouped.groupby(['A', 'B']).aggregate(func)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_agg_multiple_functions_maintain_order(df):
|
|
# GH #610
|
|
funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)]
|
|
result = df.groupby('A')['C'].agg(funcs)
|
|
exp_cols = Index(['mean', 'max', 'min'])
|
|
|
|
tm.assert_index_equal(result.columns, exp_cols)
|
|
|
|
|
|
def test_multiple_functions_tuples_and_non_tuples(df):
|
|
# #1359
|
|
funcs = [('foo', 'mean'), 'std']
|
|
ex_funcs = [('foo', 'mean'), ('std', 'std')]
|
|
|
|
result = df.groupby('A')['C'].agg(funcs)
|
|
expected = df.groupby('A')['C'].agg(ex_funcs)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.groupby('A').agg(funcs)
|
|
expected = df.groupby('A').agg(ex_funcs)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_agg_multiple_functions_too_many_lambdas(df):
|
|
grouped = df.groupby('A')
|
|
funcs = ['mean', lambda x: x.mean(), lambda x: x.std()]
|
|
|
|
msg = 'Function names must be unique, found multiple named <lambda>'
|
|
with tm.assert_raises_regex(SpecificationError, msg):
|
|
grouped.agg(funcs)
|
|
|
|
|
|
def test_more_flexible_frame_multi_function(df):
|
|
grouped = df.groupby('A')
|
|
|
|
exmean = grouped.agg(OrderedDict([['C', np.mean], ['D', np.mean]]))
|
|
exstd = grouped.agg(OrderedDict([['C', np.std], ['D', np.std]]))
|
|
|
|
expected = concat([exmean, exstd], keys=['mean', 'std'], axis=1)
|
|
expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)
|
|
|
|
d = OrderedDict([['C', [np.mean, np.std]], ['D', [np.mean, np.std]]])
|
|
result = grouped.aggregate(d)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# be careful
|
|
result = grouped.aggregate(OrderedDict([['C', np.mean],
|
|
['D', [np.mean, np.std]]]))
|
|
expected = grouped.aggregate(OrderedDict([['C', np.mean],
|
|
['D', [np.mean, np.std]]]))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def foo(x):
|
|
return np.mean(x)
|
|
|
|
def bar(x):
|
|
return np.std(x, ddof=1)
|
|
|
|
# this uses column selection & renaming
|
|
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
|
|
d = OrderedDict([['C', np.mean],
|
|
['D', OrderedDict([['foo', np.mean],
|
|
['bar', np.std]])]])
|
|
result = grouped.aggregate(d)
|
|
|
|
d = OrderedDict([['C', [np.mean]], ['D', [foo, bar]]])
|
|
expected = grouped.aggregate(d)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_multi_function_flexible_mix(df):
|
|
# GH #1268
|
|
grouped = df.groupby('A')
|
|
|
|
# Expected
|
|
d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
|
|
['D', {'sum': 'sum'}]])
|
|
# this uses column selection & renaming
|
|
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
|
|
expected = grouped.aggregate(d)
|
|
|
|
# Test 1
|
|
d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
|
|
['D', 'sum']])
|
|
# this uses column selection & renaming
|
|
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
|
|
result = grouped.aggregate(d)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Test 2
|
|
d = OrderedDict([['C', OrderedDict([['foo', 'mean'], ['bar', 'std']])],
|
|
['D', ['sum']]])
|
|
# this uses column selection & renaming
|
|
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
|
|
result = grouped.aggregate(d)
|
|
tm.assert_frame_equal(result, expected)
|