# -*- coding: utf-8 -*- """ test all other .agg behavior """ from __future__ import print_function import pytest from collections import OrderedDict import datetime as dt from functools import partial import numpy as np import pandas as pd from pandas import ( date_range, DataFrame, Index, MultiIndex, PeriodIndex, period_range, Series ) from pandas.core.groupby.groupby import SpecificationError from pandas.io.formats.printing import pprint_thing import pandas.util.testing as tm def test_agg_api(): # GH 6337 # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error # different api for agg when passed custom function with mixed frame df = DataFrame({'data1': np.random.randn(5), 'data2': np.random.randn(5), 'key1': ['a', 'a', 'b', 'b', 'a'], 'key2': ['one', 'two', 'one', 'two', 'one']}) grouped = df.groupby('key1') def peak_to_peak(arr): return arr.max() - arr.min() expected = grouped.agg([peak_to_peak]) expected.columns = ['data1', 'data2'] result = grouped.agg(peak_to_peak) tm.assert_frame_equal(result, expected) def test_agg_datetimes_mixed(): data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]] df1 = DataFrame({'key': [x[0] for x in data], 'date': [x[1] for x in data], 'value': [x[2] for x in data]}) data = [[row[0], (dt.datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] else None), row[2]] for row in data] df2 = DataFrame({'key': [x[0] for x in data], 'date': [x[1] for x in data], 'value': [x[2] for x in data]}) df1['weights'] = df1['value'] / df1['value'].sum() gb1 = df1.groupby('date').aggregate(np.sum) df2['weights'] = df1['value'] / df1['value'].sum() gb2 = df2.groupby('date').aggregate(np.sum) assert (len(gb1) == len(gb2)) def test_agg_period_index(): prng = period_range('2012-1-1', freq='M', periods=3) df = DataFrame(np.random.randn(3, 2), index=prng) rs = df.groupby(level=0).sum() assert isinstance(rs.index, PeriodIndex) # GH 3579 index = period_range(start='1999-01', periods=5, freq='M') s1 = Series(np.random.rand(len(index)), index=index) s2 = Series(np.random.rand(len(index)), index=index) series = [('s1', s1), ('s2', s2)] df = DataFrame.from_dict(OrderedDict(series)) grouped = df.groupby(df.index.month) list(grouped) def test_agg_dict_parameter_cast_result_dtypes(): # GH 12821 df = DataFrame({'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], 'time': date_range('1/1/2011', periods=8, freq='H')}) df.loc[[0, 1, 2, 5], 'time'] = None # test for `first` function exp = df.loc[[0, 3, 4, 6]].set_index('class') grouped = df.groupby('class') tm.assert_frame_equal(grouped.first(), exp) tm.assert_frame_equal(grouped.agg('first'), exp) tm.assert_frame_equal(grouped.agg({'time': 'first'}), exp) tm.assert_series_equal(grouped.time.first(), exp['time']) tm.assert_series_equal(grouped.time.agg('first'), exp['time']) # test for `last` function exp = df.loc[[0, 3, 4, 7]].set_index('class') grouped = df.groupby('class') tm.assert_frame_equal(grouped.last(), exp) tm.assert_frame_equal(grouped.agg('last'), exp) tm.assert_frame_equal(grouped.agg({'time': 'last'}), exp) tm.assert_series_equal(grouped.time.last(), exp['time']) tm.assert_series_equal(grouped.time.agg('last'), exp['time']) # count exp = pd.Series([2, 2, 2, 2], index=Index(list('ABCD'), name='class'), name='time') tm.assert_series_equal(grouped.time.agg(len), exp) tm.assert_series_equal(grouped.time.size(), exp) exp = pd.Series([0, 1, 1, 2], index=Index(list('ABCD'), name='class'), name='time') tm.assert_series_equal(grouped.time.count(), exp) def test_agg_cast_results_dtypes(): # similar to GH12821 # xref #11444 u = [dt.datetime(2015, x + 1, 1) for x in range(12)] v = list('aaabbbbbbccd') df = pd.DataFrame({'X': v, 'Y': u}) result = df.groupby('X')['Y'].agg(len) expected = df.groupby('X')['Y'].count() tm.assert_series_equal(result, expected) def test_aggregate_float64_no_int64(): # see gh-11199 df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]}) expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) expected.index.name = "b" result = df.groupby("b")[["a"]].mean() tm.assert_frame_equal(result, expected) expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5]) expected.index.name = "b" result = df.groupby("b")[["a", "c"]].mean() tm.assert_frame_equal(result, expected) def test_aggregate_api_consistency(): # GH 9052 # make sure that the aggregates via dict # are consistent df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': np.random.randn(8) + 1.0, 'D': np.arange(8)}) grouped = df.groupby(['A', 'B']) c_mean = grouped['C'].mean() c_sum = grouped['C'].sum() d_mean = grouped['D'].mean() d_sum = grouped['D'].sum() result = grouped['D'].agg(['sum', 'mean']) expected = pd.concat([d_sum, d_mean], axis=1) expected.columns = ['sum', 'mean'] tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg([np.sum, np.mean]) expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1) expected.columns = MultiIndex.from_product([['C', 'D'], ['sum', 'mean']]) tm.assert_frame_equal(result, expected, check_like=True) result = grouped[['D', 'C']].agg([np.sum, np.mean]) expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1) expected.columns = MultiIndex.from_product([['D', 'C'], ['sum', 'mean']]) tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg({'C': 'mean', 'D': 'sum'}) expected = pd.concat([d_sum, c_mean], axis=1) tm.assert_frame_equal(result, expected, check_like=True) result = grouped.agg({'C': ['mean', 'sum'], 'D': ['mean', 'sum']}) expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1) expected.columns = MultiIndex.from_product([['C', 'D'], ['mean', 'sum']]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = grouped[['D', 'C']].agg({'r': np.sum, 'r2': np.mean}) expected = pd.concat([d_sum, c_sum, d_mean, c_mean], axis=1) expected.columns = MultiIndex.from_product([['r', 'r2'], ['D', 'C']]) tm.assert_frame_equal(result, expected, check_like=True) def test_agg_dict_renaming_deprecation(): # 15931 df = pd.DataFrame({'A': [1, 1, 1, 2, 2], 'B': range(5), 'C': range(5)}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False) as w: df.groupby('A').agg({'B': {'foo': ['sum', 'max']}, 'C': {'bar': ['count', 'min']}}) assert "using a dict with renaming" in str(w[0].message) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): df.groupby('A')[['B', 'C']].agg({'ma': 'max'}) with tm.assert_produces_warning(FutureWarning) as w: df.groupby('A').B.agg({'foo': 'count'}) assert "using a dict on a Series for aggregation" in str(w[0].message) def test_agg_compat(): # GH 12334 df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': np.random.randn(8) + 1.0, 'D': np.arange(8)}) g = df.groupby(['A', 'B']) expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) expected.columns = MultiIndex.from_tuples([('C', 'sum'), ('C', 'std')]) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = g['D'].agg({'C': ['sum', 'std']}) tm.assert_frame_equal(result, expected, check_like=True) expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1) expected.columns = ['C', 'D'] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = g['D'].agg({'C': 'sum', 'D': 'std'}) tm.assert_frame_equal(result, expected, check_like=True) def test_agg_nested_dicts(): # API change for disallowing these types of nested dicts df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'two', 'two', 'two', 'one', 'two'], 'C': np.random.randn(8) + 1.0, 'D': np.arange(8)}) g = df.groupby(['A', 'B']) msg = r'cannot perform renaming for r[1-2] with a nested dictionary' with tm.assert_raises_regex(SpecificationError, msg): g.aggregate({'r1': {'C': ['mean', 'sum']}, 'r2': {'D': ['mean', 'sum']}}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = g.agg({'C': {'ra': ['mean', 'std']}, 'D': {'rb': ['mean', 'std']}}) expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(), g['D'].std()], axis=1) expected.columns = pd.MultiIndex.from_tuples( [('ra', 'mean'), ('ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) tm.assert_frame_equal(result, expected, check_like=True) # same name as the original column # GH9052 with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) expected = expected.rename(columns={'result1': 'D'}) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): result = g['D'].agg({'D': np.sum, 'result2': np.mean}) tm.assert_frame_equal(result, expected, check_like=True) def test_agg_item_by_item_raise_typeerror(): df = DataFrame(np.random.randint(10, size=(20, 10))) def raiseException(df): pprint_thing('----------------------------------------') pprint_thing(df.to_string()) raise TypeError('test') with tm.assert_raises_regex(TypeError, 'test'): df.groupby(0).agg(raiseException) def test_series_agg_multikey(): ts = tm.makeTimeSeries() grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.agg(np.sum) expected = grouped.sum() tm.assert_series_equal(result, expected) def test_series_agg_multi_pure_python(): data = DataFrame( {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo', 'foo', 'foo'], 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two', 'two', 'one'], 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', 'dull', 'shiny', 'shiny', 'shiny'], 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11)}) def bad(x): assert (len(x.values.base) > 0) return 'foo' result = data.groupby(['A', 'B']).agg(bad) expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') tm.assert_frame_equal(result, expected) def test_agg_consistency(): # agg with ([]) and () not consistent # GH 6715 def P1(a): try: return np.percentile(a.dropna(), q=1) except Exception: return np.nan df = DataFrame({'col1': [1, 2, 3, 4], 'col2': [10, 25, 26, 31], 'date': [dt.date(2013, 2, 10), dt.date(2013, 2, 10), dt.date(2013, 2, 11), dt.date(2013, 2, 11)]}) g = df.groupby('date') expected = g.agg([P1]) expected.columns = expected.columns.levels[0] result = g.agg(P1) tm.assert_frame_equal(result, expected) def test_agg_callables(): # GH 7929 df = DataFrame({'foo': [1, 2], 'bar': [3, 4]}).astype(np.int64) class fn_class(object): def __call__(self, x): return sum(x) equiv_callables = [sum, np.sum, lambda x: sum(x), lambda x: x.sum(), partial(sum), fn_class(), ] expected = df.groupby("foo").agg(sum) for ecall in equiv_callables: result = df.groupby('foo').agg(ecall) tm.assert_frame_equal(result, expected) def test_agg_over_numpy_arrays(): # GH 3788 df = pd.DataFrame([[1, np.array([10, 20, 30])], [1, np.array([40, 50, 60])], [2, np.array([20, 30, 40])]], columns=['category', 'arraydata']) result = df.groupby('category').agg(sum) expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]] expected_index = pd.Index([1, 2], name='category') expected_column = ['arraydata'] expected = pd.DataFrame(expected_data, index=expected_index, columns=expected_column) tm.assert_frame_equal(result, expected) def test_agg_timezone_round_trip(): # GH 15426 ts = pd.Timestamp("2016-01-01 12:00:00", tz='US/Pacific') df = pd.DataFrame({'a': 1, 'b': [ts + dt.timedelta(minutes=nn) for nn in range(10)]}) result1 = df.groupby('a')['b'].agg(np.min).iloc[0] result2 = df.groupby('a')['b'].agg(lambda x: np.min(x)).iloc[0] result3 = df.groupby('a')['b'].min().iloc[0] assert result1 == ts assert result2 == ts assert result3 == ts dates = [pd.Timestamp("2016-01-0%d 12:00:00" % i, tz='US/Pacific') for i in range(1, 5)] df = pd.DataFrame({'A': ['a', 'b'] * 2, 'B': dates}) grouped = df.groupby('A') ts = df['B'].iloc[0] assert ts == grouped.nth(0)['B'].iloc[0] assert ts == grouped.head(1)['B'].iloc[0] assert ts == grouped.first()['B'].iloc[0] assert ts == grouped.apply(lambda x: x.iloc[0])[0] ts = df['B'].iloc[2] assert ts == grouped.last()['B'].iloc[0] assert ts == grouped.apply(lambda x: x.iloc[-1])[0] def test_sum_uint64_overflow(): # see gh-14758 # Convert to uint64 and don't overflow df = pd.DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object) df = df + 9223372036854775807 index = pd.Index([9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64) expected = pd.DataFrame({1: [9223372036854775809, 9223372036854775811, 9223372036854775813]}, index=index) expected.index.name = 0 result = df.groupby(0).sum() tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("structure, expected", [ (tuple, pd.DataFrame({'C': {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), (list, pd.DataFrame({'C': {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})), (lambda x: tuple(x), pd.DataFrame({'C': {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})), (lambda x: list(x), pd.DataFrame({'C': {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})) ]) def test_agg_structs_dataframe(structure, expected): df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], 'B': [1, 1, 1, 4, 4, 4], 'C': [1, 1, 1, 3, 4, 4]}) result = df.groupby(['A', 'B']).aggregate(structure) expected.index.names = ['A', 'B'] tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("structure, expected", [ (tuple, pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name='C')), (list, pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name='C')), (lambda x: tuple(x), pd.Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name='C')), (lambda x: list(x), pd.Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name='C')) ]) def test_agg_structs_series(structure, expected): # Issue #18079 df = pd.DataFrame({'A': [1, 1, 1, 3, 3, 3], 'B': [1, 1, 1, 4, 4, 4], 'C': [1, 1, 1, 3, 4, 4]}) result = df.groupby('A')['C'].aggregate(structure) expected.index.name = 'A' tm.assert_series_equal(result, expected) @pytest.mark.xfail(reason="GH-18869: agg func not called on empty groups.") def test_agg_category_nansum(observed): categories = ['a', 'b', 'c'] df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'], categories=categories), 'B': [1, 2, 3]}) result = df.groupby("A", observed=observed).B.agg(np.nansum) expected = pd.Series([3, 3, 0], index=pd.CategoricalIndex(['a', 'b', 'c'], categories=categories, name='A'), name='B') if observed: expected = expected[expected != 0] tm.assert_series_equal(result, expected)