import pytest import numpy as np import pandas as pd from pandas import (DataFrame, Index, compat, isna, Series, MultiIndex, Timestamp, date_range) from pandas.errors import UnsupportedFunctionCall from pandas.util import testing as tm import pandas.core.nanops as nanops from string import ascii_lowercase from pandas.compat import product as cart_product @pytest.mark.parametrize("agg_func", ['any', 'all']) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("vals", [ ['foo', 'bar', 'baz'], ['foo', '', ''], ['', '', ''], [1, 2, 3], [1, 0, 0], [0, 0, 0], [1., 2., 3.], [1., 0., 0.], [0., 0., 0.], [True, True, True], [True, False, False], [False, False, False], [np.nan, np.nan, np.nan] ]) def test_groupby_bool_aggs(agg_func, skipna, vals): df = DataFrame({'key': ['a'] * 3 + ['b'] * 3, 'val': vals * 2}) # Figure out expectation using Python builtin exp = getattr(compat.builtins, agg_func)(vals) # edge case for missing data with skipna and 'any' if skipna and all(isna(vals)) and agg_func == 'any': exp = False exp_df = DataFrame([exp] * 2, columns=['val'], index=Index( ['a', 'b'], name='key')) result = getattr(df.groupby('key'), agg_func)(skipna=skipna) tm.assert_frame_equal(result, exp_df) def test_max_min_non_numeric(): # #2700 aa = DataFrame({'nn': [11, 11, 22, 22], 'ii': [1, 2, 3, 4], 'ss': 4 * ['mama']}) result = aa.groupby('nn').max() assert 'ss' in result result = aa.groupby('nn').max(numeric_only=False) assert 'ss' in result result = aa.groupby('nn').min() assert 'ss' in result result = aa.groupby('nn').min(numeric_only=False) assert 'ss' in result def test_intercept_builtin_sum(): s = Series([1., 2., np.nan, 3.]) grouped = s.groupby([0, 1, 2, 2]) result = grouped.agg(compat.builtins.sum) result2 = grouped.apply(compat.builtins.sum) expected = grouped.sum() tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) def test_builtins_apply(): # GH8155 df = pd.DataFrame(np.random.randint(1, 50, (1000, 2)), columns=['jim', 'joe']) df['jolie'] = np.random.randn(1000) for keys in ['jim', ['jim', 'joe']]: # single key & multi-key if keys == 'jim': continue for f in [max, min, sum]: fname = f.__name__ result = df.groupby(keys).apply(f) result.shape ngroups = len(df.drop_duplicates(subset=keys)) assert result.shape == (ngroups, 3), 'invalid frame shape: '\ '{} (expected ({}, 3))'.format(result.shape, ngroups) tm.assert_frame_equal(result, # numpy's equivalent function df.groupby(keys).apply(getattr(np, fname))) if f != sum: expected = df.groupby(keys).agg(fname).reset_index() expected.set_index(keys, inplace=True, drop=False) tm.assert_frame_equal(result, expected, check_dtype=False) tm.assert_series_equal(getattr(result, fname)(), getattr(df, fname)()) def test_arg_passthru(): # make sure that we are passing thru kwargs # to our agg functions # GH3668 # GH5724 df = pd.DataFrame( {'group': [1, 1, 2], 'int': [1, 2, 3], 'float': [4., 5., 6.], 'string': list('abc'), 'category_string': pd.Series(list('abc')).astype('category'), 'category_int': [7, 8, 9], 'datetime': pd.date_range('20130101', periods=3), 'datetimetz': pd.date_range('20130101', periods=3, tz='US/Eastern'), 'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')}, columns=['group', 'int', 'float', 'string', 'category_string', 'category_int', 'datetime', 'datetimetz', 'timedelta']) expected_columns_numeric = Index(['int', 'float', 'category_int']) # mean / median expected = pd.DataFrame( {'category_int': [7.5, 9], 'float': [4.5, 6.], 'timedelta': [pd.Timedelta('1.5s'), pd.Timedelta('3s')], 'int': [1.5, 3], 'datetime': [pd.Timestamp('2013-01-01 12:00:00'), pd.Timestamp('2013-01-03 00:00:00')], 'datetimetz': [ pd.Timestamp('2013-01-01 12:00:00', tz='US/Eastern'), pd.Timestamp('2013-01-03 00:00:00', tz='US/Eastern')]}, index=Index([1, 2], name='group'), columns=['int', 'float', 'category_int', 'datetime', 'datetimetz', 'timedelta']) for attr in ['mean', 'median']: f = getattr(df.groupby('group'), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) result = f(numeric_only=False) tm.assert_frame_equal(result.reindex_like(expected), expected) # TODO: min, max *should* handle # categorical (ordered) dtype expected_columns = Index(['int', 'float', 'string', 'category_int', 'datetime', 'datetimetz', 'timedelta']) for attr in ['min', 'max']: f = getattr(df.groupby('group'), attr) result = f() tm.assert_index_equal(result.columns, expected_columns) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) expected_columns = Index(['int', 'float', 'string', 'category_string', 'category_int', 'datetime', 'datetimetz', 'timedelta']) for attr in ['first', 'last']: f = getattr(df.groupby('group'), attr) result = f() tm.assert_index_equal(result.columns, expected_columns) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) expected_columns = Index(['int', 'float', 'string', 'category_int', 'timedelta']) for attr in ['sum']: f = getattr(df.groupby('group'), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) expected_columns = Index(['int', 'float', 'category_int']) for attr in ['prod', 'cumprod']: f = getattr(df.groupby('group'), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) # like min, max, but don't include strings expected_columns = Index(['int', 'float', 'category_int', 'datetime', 'datetimetz', 'timedelta']) for attr in ['cummin', 'cummax']: f = getattr(df.groupby('group'), attr) result = f() # GH 15561: numeric_only=False set by default like min/max tm.assert_index_equal(result.columns, expected_columns) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) expected_columns = Index(['int', 'float', 'category_int', 'timedelta']) for attr in ['cumsum']: f = getattr(df.groupby('group'), attr) result = f() tm.assert_index_equal(result.columns, expected_columns_numeric) result = f(numeric_only=False) tm.assert_index_equal(result.columns, expected_columns) def test_non_cython_api(): # GH5610 # non-cython calls should not include the grouper df = DataFrame( [[1, 2, 'foo'], [1, np.nan, 'bar'], [3, np.nan, 'baz']], columns=['A', 'B', 'C']) g = df.groupby('A') gni = df.groupby('A', as_index=False) # mad expected = DataFrame([[0], [np.nan]], columns=['B'], index=[1, 3]) expected.index.name = 'A' result = g.mad() tm.assert_frame_equal(result, expected) expected = DataFrame([[0., 0.], [0, np.nan]], columns=['A', 'B'], index=[0, 1]) result = gni.mad() tm.assert_frame_equal(result, expected) # describe expected_index = pd.Index([1, 3], name='A') expected_col = pd.MultiIndex(levels=[['B'], ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']], labels=[[0] * 8, list(range(8))]) expected = pd.DataFrame([[1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0], [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]], index=expected_index, columns=expected_col) result = g.describe() tm.assert_frame_equal(result, expected) expected = pd.concat([df[df.A == 1].describe().unstack().to_frame().T, df[df.A == 3].describe().unstack().to_frame().T]) expected.index = pd.Index([0, 1]) result = gni.describe() tm.assert_frame_equal(result, expected) # any expected = DataFrame([[True, True], [False, True]], columns=['B', 'C'], index=[1, 3]) expected.index.name = 'A' result = g.any() tm.assert_frame_equal(result, expected) # idxmax expected = DataFrame([[0.0], [np.nan]], columns=['B'], index=[1, 3]) expected.index.name = 'A' result = g.idxmax() tm.assert_frame_equal(result, expected) def test_cython_api2(): # this takes the fast apply path # cumsum (GH5614) df = DataFrame( [[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9] ], columns=['A', 'B', 'C']) expected = DataFrame( [[2, np.nan], [np.nan, 9], [4, 9]], columns=['B', 'C']) result = df.groupby('A').cumsum() tm.assert_frame_equal(result, expected) # GH 5755 - cumsum is a transformer and should ignore as_index result = df.groupby('A', as_index=False).cumsum() tm.assert_frame_equal(result, expected) # GH 13994 result = df.groupby('A').cumsum(axis=1) expected = df.cumsum(axis=1) tm.assert_frame_equal(result, expected) result = df.groupby('A').cumprod(axis=1) expected = df.cumprod(axis=1) tm.assert_frame_equal(result, expected) def test_cython_median(): df = DataFrame(np.random.randn(1000)) df.values[::2] = np.nan labels = np.random.randint(0, 50, size=1000).astype(float) labels[::17] = np.nan result = df.groupby(labels).median() exp = df.groupby(labels).agg(nanops.nanmedian) tm.assert_frame_equal(result, exp) df = DataFrame(np.random.randn(1000, 5)) rs = df.groupby(labels).agg(np.median) xp = df.groupby(labels).median() tm.assert_frame_equal(rs, xp) def test_median_empty_bins(observed): df = pd.DataFrame(np.random.randint(0, 44, 500)) grps = range(0, 55, 5) bins = pd.cut(df[0], grps) result = df.groupby(bins, observed=observed).median() expected = df.groupby(bins, observed=observed).agg(lambda x: x.median()) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype", [ 'int8', 'int16', 'int32', 'int64', 'float32', 'float64']) @pytest.mark.parametrize("method,data", [ ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}), ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}), ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}], 'args': [1]}), ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}], 'out_type': 'int64'}) ]) def test_groupby_non_arithmetic_agg_types(dtype, method, data): # GH9311, GH6620 df = pd.DataFrame( [{'a': 1, 'b': 1}, {'a': 1, 'b': 2}, {'a': 2, 'b': 3}, {'a': 2, 'b': 4}]) df['b'] = df.b.astype(dtype) if 'args' not in data: data['args'] = [] if 'out_type' in data: out_type = data['out_type'] else: out_type = dtype exp = data['df'] df_out = pd.DataFrame(exp) df_out['b'] = df_out.b.astype(out_type) df_out.set_index('a', inplace=True) grpd = df.groupby('a') t = getattr(grpd, method)(*data['args']) tm.assert_frame_equal(t, df_out) def test_groupby_non_arithmetic_agg_intlike_precision(): # GH9311, GH6620 c = 24650000000000000 inputs = ((Timestamp('2011-01-15 12:50:28.502376'), Timestamp('2011-01-20 12:50:28.593448')), (1 + c, 2 + c)) for i in inputs: df = pd.DataFrame([{'a': 1, 'b': i[0]}, {'a': 1, 'b': i[1]}]) grp_exp = {'first': {'expected': i[0]}, 'last': {'expected': i[1]}, 'min': {'expected': i[0]}, 'max': {'expected': i[1]}, 'nth': {'expected': i[1], 'args': [1]}, 'count': {'expected': 2}} for method, data in compat.iteritems(grp_exp): if 'args' not in data: data['args'] = [] grpd = df.groupby('a') res = getattr(grpd, method)(*data['args']) assert res.iloc[0].b == data['expected'] def test_fill_constistency(): # GH9221 # pass thru keyword arguments to the generated wrapper # are set if the passed kw is None (only) df = DataFrame(index=pd.MultiIndex.from_product( [['value1', 'value2'], date_range('2014-01-01', '2014-01-06')]), columns=Index( ['1', '2'], name='id')) df['1'] = [np.nan, 1, np.nan, np.nan, 11, np.nan, np.nan, 2, np.nan, np.nan, 22, np.nan] df['2'] = [np.nan, 3, np.nan, np.nan, 33, np.nan, np.nan, 4, np.nan, np.nan, 44, np.nan] expected = df.groupby(level=0, axis=0).fillna(method='ffill') result = df.T.groupby(level=0, axis=1).fillna(method='ffill').T tm.assert_frame_equal(result, expected) def test_groupby_cumprod(): # GH 4095 df = pd.DataFrame({'key': ['b'] * 10, 'value': 2}) actual = df.groupby('key')['value'].cumprod() expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) expected.name = 'value' tm.assert_series_equal(actual, expected) df = pd.DataFrame({'key': ['b'] * 100, 'value': 2}) actual = df.groupby('key')['value'].cumprod() # if overflows, groupby product casts to float # while numpy passes back invalid values df['value'] = df['value'].astype(float) expected = df.groupby('key')['value'].apply(lambda x: x.cumprod()) expected.name = 'value' tm.assert_series_equal(actual, expected) def test_ops_general(): ops = [('mean', np.mean), ('median', np.median), ('std', np.std), ('var', np.var), ('sum', np.sum), ('prod', np.prod), ('min', np.min), ('max', np.max), ('first', lambda x: x.iloc[0]), ('last', lambda x: x.iloc[-1]), ('count', np.size), ] try: from scipy.stats import sem except ImportError: pass else: ops.append(('sem', sem)) df = DataFrame(np.random.randn(1000)) labels = np.random.randint(0, 50, size=1000).astype(float) for op, targop in ops: result = getattr(df.groupby(labels), op)().astype(float) expected = df.groupby(labels).agg(targop) try: tm.assert_frame_equal(result, expected) except BaseException as exc: exc.args += ('operation: %s' % op, ) raise def test_max_nan_bug(): raw = """,Date,app,File -04-23,2013-04-23 00:00:00,,log080001.log -05-06,2013-05-06 00:00:00,,log.log -05-07,2013-05-07 00:00:00,OE,xlsx""" df = pd.read_csv(compat.StringIO(raw), parse_dates=[0]) gb = df.groupby('Date') r = gb[['File']].max() e = gb['File'].max().to_frame() tm.assert_frame_equal(r, e) assert not r['File'].isna().any() def test_nlargest(): a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) b = Series(list('a' * 5 + 'b' * 5)) gb = a.groupby(b) r = gb.nlargest(3) e = Series([ 7, 5, 3, 10, 9, 6 ], index=MultiIndex.from_arrays([list('aaabbb'), [3, 2, 1, 9, 5, 8]])) tm.assert_series_equal(r, e) a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) gb = a.groupby(b) e = Series([ 3, 2, 1, 3, 3, 2 ], index=MultiIndex.from_arrays([list('aaabbb'), [2, 3, 1, 6, 5, 7]])) tm.assert_series_equal(gb.nlargest(3, keep='last'), e) def test_nsmallest(): a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10]) b = Series(list('a' * 5 + 'b' * 5)) gb = a.groupby(b) r = gb.nsmallest(3) e = Series([ 1, 2, 3, 0, 4, 6 ], index=MultiIndex.from_arrays([list('aaabbb'), [0, 4, 1, 6, 7, 8]])) tm.assert_series_equal(r, e) a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0]) gb = a.groupby(b) e = Series([ 0, 1, 1, 0, 1, 2 ], index=MultiIndex.from_arrays([list('aaabbb'), [4, 1, 0, 9, 8, 7]])) tm.assert_series_equal(gb.nsmallest(3, keep='last'), e) def test_numpy_compat(): # see gh-12811 df = pd.DataFrame({'A': [1, 2, 1], 'B': [1, 2, 3]}) g = df.groupby('A') msg = "numpy operations are not valid with groupby" for func in ('mean', 'var', 'std', 'cumprod', 'cumsum'): tm.assert_raises_regex(UnsupportedFunctionCall, msg, getattr(g, func), 1, 2, 3) tm.assert_raises_regex(UnsupportedFunctionCall, msg, getattr(g, func), foo=1) def test_cummin_cummax(): # GH 15048 num_types = [np.int32, np.int64, np.float32, np.float64] num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min, np.finfo(np.float32).min, np.finfo(np.float64).min] num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max, np.finfo(np.float32).max, np.finfo(np.float64).max] base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2], 'B': [3, 4, 3, 2, 2, 3, 2, 1]}) expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] for dtype, min_val, max_val in zip(num_types, num_mins, num_max): df = base_df.astype(dtype) # cummin expected = pd.DataFrame({'B': expected_mins}).astype(dtype) result = df.groupby('A').cummin() tm.assert_frame_equal(result, expected) result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) # Test cummin w/ min value for dtype df.loc[[2, 6], 'B'] = min_val expected.loc[[2, 3, 6, 7], 'B'] = min_val result = df.groupby('A').cummin() tm.assert_frame_equal(result, expected) expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(result, expected) # cummax expected = pd.DataFrame({'B': expected_maxs}).astype(dtype) result = df.groupby('A').cummax() tm.assert_frame_equal(result, expected) result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(result, expected) # Test cummax w/ max value for dtype df.loc[[2, 6], 'B'] = max_val expected.loc[[2, 3, 6, 7], 'B'] = max_val result = df.groupby('A').cummax() tm.assert_frame_equal(result, expected) expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(result, expected) # Test nan in some values base_df.loc[[0, 2, 4, 6], 'B'] = np.nan expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]}) result = base_df.groupby('A').cummin() tm.assert_frame_equal(result, expected) expected = (base_df.groupby('A') .B .apply(lambda x: x.cummin()) .to_frame()) tm.assert_frame_equal(result, expected) expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) result = base_df.groupby('A').cummax() tm.assert_frame_equal(result, expected) expected = (base_df.groupby('A') .B .apply(lambda x: x.cummax()) .to_frame()) tm.assert_frame_equal(result, expected) # Test nan in entire column base_df['B'] = np.nan expected = pd.DataFrame({'B': [np.nan] * 8}) result = base_df.groupby('A').cummin() tm.assert_frame_equal(expected, result) result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() tm.assert_frame_equal(expected, result) result = base_df.groupby('A').cummax() tm.assert_frame_equal(expected, result) result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() tm.assert_frame_equal(expected, result) # GH 15561 df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(['2001']))) expected = pd.Series(pd.to_datetime('2001'), index=[0], name='b') for method in ['cummax', 'cummin']: result = getattr(df.groupby('a')['b'], method)() tm.assert_series_equal(expected, result) # GH 15635 df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) result = df.groupby('a').b.cummax() expected = pd.Series([2, 1, 2], name='b') tm.assert_series_equal(result, expected) df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) result = df.groupby('a').b.cummin() expected = pd.Series([1, 2, 1], name='b') tm.assert_series_equal(result, expected) @pytest.mark.parametrize('in_vals, out_vals', [ # Basics: strictly increasing (T), strictly decreasing (F), # abs val increasing (F), non-strictly increasing (T) ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]), # Test with inf vals ([1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf], [True, False, True, False]), # Test with nan vals; should always be False ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], [False, False, False, False]), ]) def test_is_monotonic_increasing(in_vals, out_vals): # GH 17015 source_dict = { 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], 'C': in_vals} df = pd.DataFrame(source_dict) result = df.groupby('B').C.is_monotonic_increasing index = Index(list('abcd'), name='B') expected = pd.Series(index=index, data=out_vals, name='C') tm.assert_series_equal(result, expected) # Also check result equal to manually taking x.is_monotonic_increasing. expected = ( df.groupby(['B']).C.apply(lambda x: x.is_monotonic_increasing)) tm.assert_series_equal(result, expected) @pytest.mark.parametrize('in_vals, out_vals', [ # Basics: strictly decreasing (T), strictly increasing (F), # abs val decreasing (F), non-strictly increasing (T) ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]), # Test with inf vals ([np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf], [True, True, False, True]), # Test with nan vals; should always be False ([1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan], [False, False, False, False]), ]) def test_is_monotonic_decreasing(in_vals, out_vals): # GH 17015 source_dict = { 'A': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11'], 'B': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd'], 'C': in_vals} df = pd.DataFrame(source_dict) result = df.groupby('B').C.is_monotonic_decreasing index = Index(list('abcd'), name='B') expected = pd.Series(index=index, data=out_vals, name='C') tm.assert_series_equal(result, expected) # describe # -------------------------------- def test_apply_describe_bug(mframe): grouped = mframe.groupby(level='first') grouped.describe() # it works! def test_series_describe_multikey(): ts = tm.makeTimeSeries() grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() tm.assert_series_equal(result['mean'], grouped.mean(), check_names=False) tm.assert_series_equal(result['std'], grouped.std(), check_names=False) tm.assert_series_equal(result['min'], grouped.min(), check_names=False) def test_series_describe_single(): ts = tm.makeTimeSeries() grouped = ts.groupby(lambda x: x.month) result = grouped.apply(lambda x: x.describe()) expected = grouped.describe().stack() tm.assert_series_equal(result, expected) def test_series_index_name(df): grouped = df.loc[:, ['C']].groupby(df['A']) result = grouped.agg(lambda x: x.mean()) assert result.index.name == 'A' def test_frame_describe_multikey(tsframe): grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month]) result = grouped.describe() desc_groups = [] for col in tsframe: group = grouped[col].describe() # GH 17464 - Remove duplicate MultiIndex levels group_col = pd.MultiIndex( levels=[[col], group.columns], labels=[[0] * len(group.columns), range(len(group.columns))]) group = pd.DataFrame(group.values, columns=group_col, index=group.index) desc_groups.append(group) expected = pd.concat(desc_groups, axis=1) tm.assert_frame_equal(result, expected) groupedT = tsframe.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1}, axis=1) result = groupedT.describe() expected = tsframe.describe().T expected.index = pd.MultiIndex( levels=[[0, 1], expected.index], labels=[[0, 0, 1, 1], range(len(expected.index))]) tm.assert_frame_equal(result, expected) def test_frame_describe_tupleindex(): # GH 14848 - regression from 0.19.0 to 0.19.1 df1 = DataFrame({'x': [1, 2, 3, 4, 5] * 3, 'y': [10, 20, 30, 40, 50] * 3, 'z': [100, 200, 300, 400, 500] * 3}) df1['k'] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5 df2 = df1.rename(columns={'k': 'key'}) pytest.raises(ValueError, lambda: df1.groupby('k').describe()) pytest.raises(ValueError, lambda: df2.groupby('key').describe()) def test_frame_describe_unstacked_format(): # GH 4792 prices = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 24990, pd.Timestamp('2011-01-06 12:43:33', tz=None): 25499, pd.Timestamp('2011-01-06 12:54:09', tz=None): 25499} volumes = {pd.Timestamp('2011-01-06 10:59:05', tz=None): 1500000000, pd.Timestamp('2011-01-06 12:43:33', tz=None): 5000000000, pd.Timestamp('2011-01-06 12:54:09', tz=None): 100000000} df = pd.DataFrame({'PRICE': prices, 'VOLUME': volumes}) result = df.groupby('PRICE').VOLUME.describe() data = [df[df.PRICE == 24990].VOLUME.describe().values.tolist(), df[df.PRICE == 25499].VOLUME.describe().values.tolist()] expected = pd.DataFrame(data, index=pd.Index([24990, 25499], name='PRICE'), columns=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']) tm.assert_frame_equal(result, expected) # nunique # -------------------------------- @pytest.mark.parametrize("n, m", cart_product(10 ** np.arange(2, 6), (10, 100, 1000))) @pytest.mark.parametrize("sort, dropna", cart_product((False, True), repeat=2)) def test_series_groupby_nunique(n, m, sort, dropna): def check_nunique(df, keys, as_index=True): gr = df.groupby(keys, as_index=as_index, sort=sort) left = gr['julie'].nunique(dropna=dropna) gr = df.groupby(keys, as_index=as_index, sort=sort) right = gr['julie'].apply(Series.nunique, dropna=dropna) if not as_index: right = right.reset_index(drop=True) tm.assert_series_equal(left, right, check_names=False) days = date_range('2015-08-23', periods=10) frame = DataFrame({'jim': np.random.choice(list(ascii_lowercase), n), 'joe': np.random.choice(days, n), 'julie': np.random.randint(0, m, n)}) check_nunique(frame, ['jim']) check_nunique(frame, ['jim', 'joe']) frame.loc[1::17, 'jim'] = None frame.loc[3::37, 'joe'] = None frame.loc[7::19, 'julie'] = None frame.loc[8::19, 'julie'] = None frame.loc[9::19, 'julie'] = None check_nunique(frame, ['jim']) check_nunique(frame, ['jim', 'joe']) check_nunique(frame, ['jim'], as_index=False) check_nunique(frame, ['jim', 'joe'], as_index=False) def test_nunique(): df = DataFrame({ 'A': list('abbacc'), 'B': list('abxacc'), 'C': list('abbacx'), }) expected = DataFrame({'A': [1] * 3, 'B': [1, 2, 1], 'C': [1, 1, 2]}) result = df.groupby('A', as_index=False).nunique() tm.assert_frame_equal(result, expected) # as_index expected.index = list('abc') expected.index.name = 'A' result = df.groupby('A').nunique() tm.assert_frame_equal(result, expected) # with na result = df.replace({'x': None}).groupby('A').nunique(dropna=False) tm.assert_frame_equal(result, expected) # dropna expected = DataFrame({'A': [1] * 3, 'B': [1] * 3, 'C': [1] * 3}, index=list('abc')) expected.index.name = 'A' result = df.replace({'x': None}).groupby('A').nunique() tm.assert_frame_equal(result, expected) def test_nunique_with_object(): # GH 11077 data = pd.DataFrame( [[100, 1, 'Alice'], [200, 2, 'Bob'], [300, 3, 'Charlie'], [-400, 4, 'Dan'], [500, 5, 'Edith']], columns=['amount', 'id', 'name'] ) result = data.groupby(['id', 'amount'])['name'].nunique() index = MultiIndex.from_arrays([data.id, data.amount]) expected = pd.Series([1] * 5, name='name', index=index) tm.assert_series_equal(result, expected) def test_nunique_with_empty_series(): # GH 12553 data = pd.Series(name='name') result = data.groupby(level=0).nunique() expected = pd.Series(name='name', dtype='int64') tm.assert_series_equal(result, expected) def test_nunique_with_timegrouper(): # GH 13453 test = pd.DataFrame({ 'time': [Timestamp('2016-06-28 09:35:35'), Timestamp('2016-06-28 16:09:30'), Timestamp('2016-06-28 16:46:28')], 'data': ['1', '2', '3']}).set_index('time') result = test.groupby(pd.Grouper(freq='h'))['data'].nunique() expected = test.groupby( pd.Grouper(freq='h') )['data'].apply(pd.Series.nunique) tm.assert_series_equal(result, expected) # count # -------------------------------- def test_groupby_timedelta_cython_count(): df = DataFrame({'g': list('ab' * 2), 'delt': np.arange(4).astype('timedelta64[ns]')}) expected = Series([ 2, 2 ], index=pd.Index(['a', 'b'], name='g'), name='delt') result = df.groupby('g').delt.count() tm.assert_series_equal(expected, result) def test_count(): n = 1 << 15 dr = date_range('2015-08-30', periods=n // 10, freq='T') df = DataFrame({ '1st': np.random.choice( list(ascii_lowercase), n), '2nd': np.random.randint(0, 5, n), '3rd': np.random.randn(n).round(3), '4th': np.random.randint(-10, 10, n), '5th': np.random.choice(dr, n), '6th': np.random.randn(n).round(3), '7th': np.random.randn(n).round(3), '8th': np.random.choice(dr, n) - np.random.choice(dr, 1), '9th': np.random.choice( list(ascii_lowercase), n) }) for col in df.columns.drop(['1st', '2nd', '4th']): df.loc[np.random.choice(n, n // 10), col] = np.nan df['9th'] = df['9th'].astype('category') for key in '1st', '2nd', ['1st', '2nd']: left = df.groupby(key).count() right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1) tm.assert_frame_equal(left, right) # GH5610 # count counts non-nulls df = pd.DataFrame([[1, 2, 'foo'], [1, np.nan, 'bar'], [3, np.nan, np.nan]], columns=['A', 'B', 'C']) count_as = df.groupby('A').count() count_not_as = df.groupby('A', as_index=False).count() expected = DataFrame([[1, 2], [0, 0]], columns=['B', 'C'], index=[1, 3]) expected.index.name = 'A' tm.assert_frame_equal(count_not_as, expected.reset_index()) tm.assert_frame_equal(count_as, expected) count_B = df.groupby('A')['B'].count() tm.assert_series_equal(count_B, expected['B']) def test_count_object(): df = pd.DataFrame({'a': ['a'] * 3 + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) result = df.groupby('c').a.count() expected = pd.Series([ 3, 3 ], index=pd.Index([2, 3], name='c'), name='a') tm.assert_series_equal(result, expected) df = pd.DataFrame({'a': ['a', np.nan, np.nan] + ['b'] * 3, 'c': [2] * 3 + [3] * 3}) result = df.groupby('c').a.count() expected = pd.Series([ 1, 3 ], index=pd.Index([2, 3], name='c'), name='a') tm.assert_series_equal(result, expected) def test_count_cross_type(): # GH8169 vals = np.hstack((np.random.randint(0, 5, (100, 2)), np.random.randint( 0, 2, (100, 2)))) df = pd.DataFrame(vals, columns=['a', 'b', 'c', 'd']) df[df == 2] = np.nan expected = df.groupby(['c', 'd']).count() for t in ['float32', 'object']: df['a'] = df['a'].astype(t) df['b'] = df['b'].astype(t) result = df.groupby(['c', 'd']).count() tm.assert_frame_equal(result, expected) def test_lower_int_prec_count(): df = DataFrame({'a': np.array( [0, 1, 2, 100], np.int8), 'b': np.array( [1, 2, 3, 6], np.uint32), 'c': np.array( [4, 5, 6, 8], np.int16), 'grp': list('ab' * 2)}) result = df.groupby('grp').count() expected = DataFrame({'a': [2, 2], 'b': [2, 2], 'c': [2, 2]}, index=pd.Index(list('ab'), name='grp')) tm.assert_frame_equal(result, expected) def test_count_uses_size_on_exception(): class RaisingObjectException(Exception): pass class RaisingObject(object): def __init__(self, msg='I will raise inside Cython'): super(RaisingObject, self).__init__() self.msg = msg def __eq__(self, other): # gets called in Cython to check that raising calls the method raise RaisingObjectException(self.msg) df = DataFrame({'a': [RaisingObject() for _ in range(4)], 'grp': list('ab' * 2)}) result = df.groupby('grp').count() expected = DataFrame({'a': [2, 2]}, index=pd.Index( list('ab'), name='grp')) tm.assert_frame_equal(result, expected) # size # -------------------------------- def test_size(df): grouped = df.groupby(['A', 'B']) result = grouped.size() for key, group in grouped: assert result[key] == len(group) grouped = df.groupby('A') result = grouped.size() for key, group in grouped: assert result[key] == len(group) grouped = df.groupby('B') result = grouped.size() for key, group in grouped: assert result[key] == len(group) df = DataFrame(np.random.choice(20, (1000, 3)), columns=list('abc')) for sort, key in cart_product((False, True), ('a', 'b', ['a', 'b'])): left = df.groupby(key, sort=sort).size() right = df.groupby(key, sort=sort)['c'].apply(lambda a: a.shape[0]) tm.assert_series_equal(left, right, check_names=False) # GH11699 df = DataFrame([], columns=['A', 'B']) out = Series([], dtype='int64', index=Index([], name='A')) tm.assert_series_equal(df.groupby('A').size(), out) # pipe # -------------------------------- def test_pipe(): # Test the pipe method of DataFrameGroupBy. # Issue #17871 random_state = np.random.RandomState(1234567890) df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': random_state.randn(8), 'C': random_state.randn(8)}) def f(dfgb): return dfgb.B.max() - dfgb.C.min().min() def square(srs): return srs ** 2 # Note that the transformations are # GroupBy -> Series # Series -> Series # This then chains the GroupBy.pipe and the # NDFrame.pipe methods result = df.groupby('A').pipe(f).pipe(square) index = Index([u'bar', u'foo'], dtype='object', name=u'A') expected = pd.Series([8.99110003361, 8.17516964785], name='B', index=index) tm.assert_series_equal(expected, result) def test_pipe_args(): # Test passing args to the pipe method of DataFrameGroupBy. # Issue #17871 df = pd.DataFrame({'group': ['A', 'A', 'B', 'B', 'C'], 'x': [1.0, 2.0, 3.0, 2.0, 5.0], 'y': [10.0, 100.0, 1000.0, -100.0, -1000.0]}) def f(dfgb, arg1): return (dfgb.filter(lambda grp: grp.y.mean() > arg1, dropna=False) .groupby(dfgb.grouper)) def g(dfgb, arg2): return dfgb.sum() / dfgb.sum().sum() + arg2 def h(df, arg3): return df.x + df.y - arg3 result = (df .groupby('group') .pipe(f, 0) .pipe(g, 10) .pipe(h, 100)) # Assert the results here index = pd.Index(['A', 'B', 'C'], name='group') expected = pd.Series([-79.5160891089, -78.4839108911, -80], index=index) tm.assert_series_equal(expected, result) # test SeriesGroupby.pipe ser = pd.Series([1, 1, 2, 2, 3, 3]) result = ser.groupby(ser).pipe(lambda grp: grp.sum() * grp.count()) expected = pd.Series([4, 8, 12], index=pd.Int64Index([1, 2, 3])) tm.assert_series_equal(result, expected)