laywerrobot/lib/python3.6/site-packages/pandas/tests/groupby/test_groupby.py
2020-08-27 21:55:39 +02:00

1695 lines
53 KiB
Python

# -*- coding: utf-8 -*-
from __future__ import print_function
import pytest
from warnings import catch_warnings
from datetime import datetime
from decimal import Decimal
from pandas import (date_range, Timestamp,
Index, MultiIndex, DataFrame, Series,
Panel, DatetimeIndex, read_csv)
from pandas.errors import PerformanceWarning
from pandas.util.testing import (assert_frame_equal,
assert_series_equal, assert_almost_equal)
from pandas.compat import (range, lrange, StringIO, lmap, lzip, map, zip,
OrderedDict)
from pandas import compat
from collections import defaultdict
import pandas.core.common as com
import numpy as np
import pandas.util.testing as tm
import pandas as pd
def test_repr():
    """Check the string representation of a ``Grouper`` (GH18203)."""
    grouper = pd.Grouper(key='A', level='B')
    expected = "Grouper(key='A', level='B', axis=0, sort=False)"
    assert repr(grouper) == expected
@pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32'])
def test_basic(dtype):
    """Exercise basic Series groupby iteration, aggregation and transform.

    Nine values (three each of 0, 1 and 2) are put in shuffled order and
    grouped with the key function ``x // 3``.  The checks cover iteration,
    ``aggregate``/``agg``, ``transform`` versus ``apply``, grouping by the
    values themselves, list/dict aggregation specs and a non-reducing
    function passed to ``aggregate``.
    """
    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)

    # Shuffle the row order before grouping.
    shuffled = np.arange(9)
    np.random.shuffle(shuffled)
    data = data.reindex(shuffled)

    grouped = data.groupby(lambda x: x // 3)

    for _, chunk in grouped:
        assert len(chunk) == 3

    means = grouped.aggregate(np.mean)
    assert means[1] == 1

    assert_series_equal(means, grouped.agg(np.mean))  # shorthand
    assert_series_equal(means, grouped.mean())
    assert_series_equal(grouped.agg(np.sum), grouped.sum())

    expected = grouped.apply(lambda x: x * x.sum())
    transformed = grouped.transform(lambda x: x * x.sum())
    assert transformed[7] == 12
    assert_series_equal(transformed, expected)

    by_value = data.groupby(data)
    assert_series_equal(by_value.aggregate(np.mean), means,
                        check_index_type=False)

    # complex agg
    means = grouped.aggregate([np.mean, np.std])

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        means = grouped.aggregate({'one': np.mean, 'two': np.std})

    offsets = {0: 10, 1: 20, 2: 30}
    means = grouped.agg(lambda x: offsets[x.name] + x.mean())
    assert means[1] == 21

    # corner cases
    with pytest.raises(Exception):
        grouped.aggregate(lambda x: x * 2)
def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
    """Group by non-object keys and check dtypes after ``apply``."""
    codes = mframe.index.labels[0]
    by_codes = mframe.groupby(codes)
    result = by_codes.sum()

    # Same keys cast to object dtype must give the same sums.
    expected = mframe.groupby(codes.astype('O')).sum()
    assert_frame_equal(result, expected)

    # GH 3911, mixed frame non-conversion
    frame = df_mixed_floats.copy()
    frame['value'] = lrange(len(frame))

    def row_with_max_value(group):
        return group.loc[group['value'].idxmax()]

    applied = frame.groupby('A').apply(row_with_max_value)
    dtype_counts = applied.get_dtype_counts().sort_values()
    expected_counts = Series({'float64': 2,
                              'int64': 1,
                              'object': 2}).sort_values()
    assert_series_equal(dtype_counts, expected_counts)
def test_groupby_return_type():
    """Check the container type returned from grouped ``apply``/``count``.

    Covers GH2893 (``squeeze=True``), GH3596 (``squeeze=False``) and GH5592
    (applied functions that return ``None`` for one group, with numeric,
    datetime and scalar outputs).
    """

    def demean_val2(dataf):
        return dataf["val2"] - dataf["val2"].mean()

    # GH2893, return a reduced type
    two_keys = DataFrame(
        [{"val1": 1, "val2": 20},
         {"val1": 1, "val2": 19},
         {"val1": 2, "val2": 27},
         {"val1": 2, "val2": 12}])
    result = two_keys.groupby("val1", squeeze=True).apply(demean_val2)
    assert isinstance(result, Series)

    one_key = DataFrame(
        [{"val1": 1, "val2": 20},
         {"val1": 1, "val2": 19},
         {"val1": 1, "val2": 27},
         {"val1": 1, "val2": 12}])
    result = one_key.groupby("val1", squeeze=True).apply(demean_val2)
    assert isinstance(result, Series)

    # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
    df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
    result = df.groupby('X', squeeze=False).count()
    assert isinstance(result, DataFrame)

    # GH5592
    # inconsistent return type
    df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
                           'Pony', 'Pony'],
                        B=Series(np.arange(7), dtype='int64'),
                        C=date_range('20130101', periods=7)))

    def first_row(grp):
        return grp.iloc[0]

    def first_row_except(skipped):
        # Build an applied function that yields None for one named group.
        def picker(grp):
            if grp.name == skipped:
                return None
            return grp.iloc[0]
        return picker

    expected = df.groupby('A').first()[['B']]
    result = df.groupby('A').apply(first_row)[['B']]
    assert_frame_equal(result, expected)

    for skipped in ('Tiger', 'Pony'):
        result = df.groupby('A').apply(first_row_except(skipped))[['B']]
        with_gap = expected.copy()
        with_gap.loc[skipped] = np.nan
        assert_frame_equal(result, with_gap)

    # 5592 revisited, with datetimes
    result = df.groupby('A').apply(first_row_except('Pony'))[['C']]
    with_gap = df.groupby('A').first()[['C']]
    with_gap.loc['Pony'] = pd.NaT
    assert_frame_equal(result, with_gap)

    # scalar outputs
    def first_c_except_pony(grp):
        if grp.name == 'Pony':
            return None
        return grp.iloc[0].loc['C']

    result = df.groupby('A').apply(first_c_except_pony)
    with_gap = df.groupby('A').first()['C'].copy()
    with_gap.loc['Pony'] = np.nan
    with_gap.name = None
    assert_series_equal(result, with_gap)
def test_pass_args_kwargs(ts, tsframe):
    """Forward positional and keyword arguments through agg/apply/transform.

    The 80th percentile is computed three ways (``np.percentile`` with extra
    arguments, a wrapper taking ``q`` by keyword, and ``quantile(.8)``) and
    the results are compared for both a Series and a DataFrame.
    """

    def f(x, q=None, axis=0):
        return np.percentile(x, q, axis=axis)

    def fixed_percentile(x):
        return np.percentile(x, 80, axis=0)

    # Series
    by_month = ts.groupby(lambda x: x.month)
    agg_result = by_month.agg(np.percentile, 80, axis=0)
    apply_result = by_month.apply(np.percentile, 80, axis=0)
    trans_result = by_month.transform(np.percentile, 80, axis=0)

    agg_expected = by_month.quantile(.8)
    trans_expected = by_month.transform(fixed_percentile)

    assert_series_equal(apply_result, agg_expected)
    assert_series_equal(agg_result, agg_expected, check_names=False)
    assert_series_equal(trans_result, trans_expected)

    agg_result = by_month.agg(f, q=80)
    apply_result = by_month.apply(f, q=80)
    trans_result = by_month.transform(f, q=80)
    assert_series_equal(agg_result, agg_expected)
    assert_series_equal(apply_result, agg_expected)
    assert_series_equal(trans_result, trans_expected)

    # DataFrame
    frame_by_month = tsframe.groupby(lambda x: x.month)
    agg_result = frame_by_month.agg(np.percentile, 80, axis=0)
    apply_result = frame_by_month.apply(DataFrame.quantile, .8)
    expected = frame_by_month.quantile(.8)
    assert_frame_equal(apply_result, expected)
    assert_frame_equal(agg_result, expected, check_names=False)

    agg_result = frame_by_month.agg(f, q=80)
    apply_result = frame_by_month.apply(DataFrame.quantile, q=.8)
    assert_frame_equal(agg_result, expected, check_names=False)
    assert_frame_equal(apply_result, expected)
def test_len():
    """Check ``len`` of a groupby object equals its number of groups."""
    frame = tm.makeTimeDataFrame()

    by_day = frame.groupby([lambda x: x.year, lambda x: x.month,
                            lambda x: x.day])
    assert len(by_day) == len(frame)

    by_month = frame.groupby([lambda x: x.year, lambda x: x.month])
    n_months = len({(stamp.year, stamp.month) for stamp in frame.index})
    assert len(by_month) == n_months

    # issue 11016
    frame = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
    assert len(frame.groupby('a')) == 0
    assert len(frame.groupby('b')) == 3
    assert len(frame.groupby(['a', 'b'])) == 3
def test_basic_regression():
    """Smoke-test ``mean`` when grouping by a random float Series."""
    # regression
    values = [1.0 * x for x in lrange(1, 10) * 10][:1095]
    series = Series(values, lrange(0, len(values)))

    keys = np.random.random((1100, ))
    keys = Series(keys, lrange(0, len(keys))) * 10.

    # Only checks that the reduction runs without raising.
    series.groupby(keys).mean()
@pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64',
                                   'int32', 'int16', 'int8'])
def test_with_na_groups(dtype):
    """Aggregate a Series whose group labels contain NaN entries."""
    index = Index(np.arange(10))
    values = Series(np.ones(10), index, dtype=dtype)
    labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan,
                     'bar', 'bar', np.nan, 'foo'], index=index)

    grouped = values.groupby(labels)

    # this SHOULD be an int
    counts = grouped.agg(len)
    expected = Series([4, 2], index=['bar', 'foo'])
    assert_series_equal(counts, expected, check_dtype=False)

    # assert issubclass(agged.dtype.type, np.integer)

    # explicitly return a float from my function
    def float_len(x):
        return float(len(x))

    counts = grouped.agg(float_len)
    expected = Series([4, 2], index=['bar', 'foo'])
    assert_series_equal(counts, expected, check_dtype=False)
    assert issubclass(counts.dtype.type, np.dtype(dtype).type)
def test_indices_concatenation_order():
    """Check ``apply`` when one group yields an empty frame (GH 2808).

    ``f1`` returns an empty frame with a matching two-level index and must
    work; ``f2`` and ``f3`` return mismatched empty frames and the test
    expects ``AssertionError`` from ``apply``.
    """

    def f1(x):
        odd = x[(x.b % 2) == 1] ** 2
        if odd.empty:
            empty_index = MultiIndex(levels=[[]] * 2, labels=[[]] * 2,
                                     names=['b', 'c'])
            return DataFrame(None, columns=['a'], index=empty_index)
        return odd.set_index(['b', 'c'])

    def f2(x):
        odd = x[(x.b % 2) == 1] ** 2
        if odd.empty:
            return DataFrame()
        return odd.set_index(['b', 'c'])

    def f3(x):
        odd = x[(x.b % 2) == 1] ** 2
        if odd.empty:
            empty_index = MultiIndex(levels=[[]] * 2, labels=[[]] * 2,
                                     names=['foo', 'bar'])
            return DataFrame(None, columns=['a', 'b'], index=empty_index)
        return odd

    df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})
    df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})

    # correct result
    result1 = df.groupby('a').apply(f1)
    result2 = df2.groupby('a').apply(f1)
    assert_frame_equal(result1, result2)

    # f2 should fail (not the same number of levels),
    # f3 should fail (incorrect shape)
    for bad_func in (f2, f3):
        for frame in (df, df2):
            grouped = frame.groupby('a')
            with pytest.raises(AssertionError):
                grouped.apply(bad_func)
def test_attr_wrapper(ts):
    """Check Series attributes/methods reached through the groupby object."""
    by_weekday = ts.groupby(lambda x: x.weekday())

    result = by_weekday.std()
    expected = by_weekday.agg(lambda x: np.std(x, ddof=1))
    assert_series_equal(result, expected)

    # this is pretty cool
    result = by_weekday.describe()
    per_group = {name: gp.describe() for name, gp in by_weekday}
    expected = DataFrame(per_group).T
    assert_frame_equal(result, expected)

    # get attribute
    # NOTE(review): these two values are computed but never compared;
    # confirm whether an equality assertion was intended here.
    result = by_weekday.dtype
    expected = by_weekday.agg(lambda x: x.dtype)

    # make sure raises error
    with pytest.raises(AttributeError):
        getattr(by_weekday, 'foo')
def test_frame_groupby(tsframe):
    """Aggregate, transform and iterate a frame grouped by weekday."""
    by_weekday = tsframe.groupby(lambda x: x.weekday())

    # aggregate
    aggregated = by_weekday.aggregate(np.mean)
    assert len(aggregated) == 5
    assert len(aggregated.columns) == 4

    # by string
    with_key = tsframe.copy()
    with_key['weekday'] = [stamp.weekday() for stamp in with_key.index]
    stragged = with_key.groupby('weekday').aggregate(np.mean)
    assert_frame_equal(stragged, aggregated, check_names=False)

    # transform
    by_weekday = tsframe.head(30).groupby(lambda x: x.weekday())
    demeaned = by_weekday.transform(lambda x: x - x.mean())
    assert len(demeaned) == 30
    assert len(demeaned.columns) == 4

    # transform propagate
    broadcast = by_weekday.transform(lambda x: x.mean())
    for _, group in by_weekday:
        group_mean = group.mean()
        for stamp in group.index:
            tm.assert_series_equal(broadcast.xs(stamp), group_mean,
                                   check_names=False)

    # iterate
    for weekday, group in by_weekday:
        assert group.index[0].weekday() == weekday

    # groups / group_indices
    groups = by_weekday.groups
    indices = by_weekday.indices

    for key, labels in compat.iteritems(groups):
        taken = tsframe.index.take(indices[key])
        assert (taken == labels).all()
def test_frame_groupby_columns(tsframe):
    """Group a frame's columns (``axis=1``) through a label mapping."""
    mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
    by_columns = tsframe.groupby(mapping, axis=1)

    # aggregate
    aggregated = by_columns.aggregate(np.mean)
    assert len(aggregated) == len(tsframe)
    assert len(aggregated.columns) == 2

    # transform: grouping the transposed frame by rows must agree
    demean = lambda x: x - x.mean()
    by_rows_of_transpose = tsframe.T.groupby(mapping, axis=0)
    assert_frame_equal(by_rows_of_transpose.transform(demean).T,
                       by_columns.transform(demean))

    # iterate
    for _, chunk in by_columns:
        assert len(chunk.columns) == 2
def test_frame_set_name_single(df):
    """Check the result index is named after the single grouping key."""
    by_a = df.groupby('A')

    assert by_a.mean().index.name == 'A'

    # With as_index=False the key is not the index name.
    flat = df.groupby('A', as_index=False).mean()
    assert flat.index.name != 'A'

    assert by_a.agg(np.mean).index.name == 'A'
    assert by_a.agg({'C': np.mean, 'D': np.std}).index.name == 'A'

    column_c = by_a['C']
    assert column_c.mean().index.name == 'A'

    column_c = by_a['C']
    assert column_c.agg(np.mean).index.name == 'A'

    column_c = by_a['C']
    assert column_c.agg([np.mean, np.std]).index.name == 'A'

    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        renamed = by_a['C'].agg({'foo': np.mean, 'bar': np.std})
    assert renamed.index.name == 'A'
def test_multi_func(df):
    """Group by two lookup functions and compare with grouping by name."""
    key_a = df['A']
    key_b = df['B']

    by_funcs = df.groupby([key_a.get, key_b.get])
    agged = by_funcs.mean()
    expected = df.groupby(['A', 'B']).mean()

    # TODO groupby get drops names
    wanted = ['C', 'D']
    assert_frame_equal(agged.loc[:, wanted],
                       expected.loc[:, wanted],
                       check_names=False)

    # some "groups" with no data
    sparse_keys = DataFrame({'v1': np.random.randn(6),
                             'v2': np.random.randn(6),
                             'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                             'k2': np.array(['1', '1', '1', '2', '2', '2'])},
                            index=['one', 'two', 'three', 'four', 'five',
                                   'six'])
    # only verify that it works for now
    by_keys = sparse_keys.groupby(['k1', 'k2'])
    by_keys.agg(np.sum)
def test_multi_key_multiple_functions(df):
    """Apply a list of functions to one column grouped by two keys."""
    column_c = df.groupby(['A', 'B'])['C']

    agged = column_c.agg([np.mean, np.std])

    # Each function applied on its own should give the matching column.
    mean_part = column_c.agg(np.mean)
    std_part = column_c.agg(np.std)
    expected = DataFrame({'mean': mean_part,
                          'std': std_part})
    assert_frame_equal(agged, expected)
def test_frame_multi_key_function_list():
    """Apply a list of functions to a whole frame grouped by two keys."""
    data = DataFrame(
        {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
               'foo', 'foo', 'foo'],
         'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
               'two', 'two', 'one'],
         'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
               'dull', 'shiny', 'shiny', 'shiny'],
         'D': np.random.randn(11),
         'E': np.random.randn(11),
         'F': np.random.randn(11)})

    grouped = data.groupby(['A', 'B'])
    funcs = [np.mean, np.std]
    agged = grouped.agg(funcs)

    # Build the expectation column by column and glue the pieces together.
    pieces = [grouped['D'].agg(funcs), grouped['E'].agg(funcs),
              grouped['F'].agg(funcs)]
    expected = pd.concat(pieces, keys=['D', 'E', 'F'], axis=1)

    assert isinstance(agged.index, MultiIndex)
    assert isinstance(expected.index, MultiIndex)
    assert_frame_equal(agged, expected)
@pytest.mark.parametrize('op', [lambda x: x.sum(), lambda x: x.mean()])
def test_groupby_multiple_columns(df, op):
    """Compare a two-key reduction with nested single-key reductions."""
    data = df
    grouped = data.groupby(['A', 'B'])

    with catch_warnings(record=True):
        result1 = op(grouped)

        # Reduce group by group: outer key 'A', inner key 'B'.
        nested = defaultdict(dict)
        for outer, outer_group in data.groupby('A'):
            for inner, inner_group in outer_group.groupby('B'):
                nested[outer][inner] = op(inner_group.loc[:, ['C', 'D']])
        as_frames = {k: DataFrame(v)
                     for k, v in compat.iteritems(nested)}
        expected = Panel.fromDict(as_frames).swapaxes(0, 1)
        expected.major_axis.name, expected.minor_axis.name = 'A', 'B'

        # a little bit crude
        for col in ['C', 'D']:
            result_col = op(grouped[col])
            exp = expected[col]
            pivoted = result1[col].unstack()
            pivoted2 = result_col.unstack()
            assert_frame_equal(pivoted.reindex_like(exp), exp)
            assert_frame_equal(pivoted2.reindex_like(exp), exp)

    # test single series works the same
    result = data['C'].groupby([data['A'], data['B']]).mean()
    expected = data.groupby(['A', 'B']).mean()['C']

    assert_series_equal(result, expected)
def test_groupby_as_index_agg(df):
    """Compare ``agg`` with direct reductions under ``as_index=False``."""

    def check_against_direct(grouped):
        # agg(np.mean) must equal mean(); a per-column spec must equal the
        # mean frame with column 'D' replaced by the grouped sum.
        result = grouped.agg(np.mean)
        expected = grouped.mean()
        assert_frame_equal(result, expected)

        result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
        expected2 = grouped.mean()
        expected2['D'] = grouped.sum()['D']
        assert_frame_equal(result2, expected2)

    # single-key
    check_against_direct(df.groupby('A', as_index=False))

    grouped = df.groupby('A', as_index=True)
    expected3 = grouped['C'].sum()
    expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        result3 = grouped['C'].agg({'Q': np.sum})
    assert_frame_equal(result3, expected3)

    # multi-key
    grouped = df.groupby(['A', 'B'], as_index=False)
    check_against_direct(grouped)

    expected3 = grouped['C'].sum()
    expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
    result3 = grouped['C'].agg({'Q': np.sum})
    assert_frame_equal(result3, expected3)

    # GH7115 & GH8112 & GH8582
    frame = DataFrame(np.random.randint(0, 100, (50, 3)),
                      columns=['jim', 'joe', 'jolie'])
    ts = Series(np.random.randint(5, 10, 50), name='jim')

    gr = frame.groupby(ts)
    gr.nth(0)  # invokes set_selection_from_grouper internally
    assert_frame_equal(gr.apply(sum), frame.groupby(ts).apply(sum))

    for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
        without_index = frame.groupby(ts, as_index=False)
        left = getattr(without_index, attr)()

        with_index = frame.groupby(ts.values, as_index=True)
        right = getattr(with_index, attr)().reset_index(drop=True)

        assert_frame_equal(left, right)
def test_as_index_series_return_frame(df):
    """Selecting a column with ``as_index=False`` still yields a frame."""
    one_key = df.groupby('A', as_index=False)
    two_keys = df.groupby(['A', 'B'], as_index=False)

    # via agg(np.sum)
    result = one_key['C'].agg(np.sum)
    expected = one_key.agg(np.sum).loc[:, ['A', 'C']]
    assert isinstance(result, DataFrame)
    assert_frame_equal(result, expected)

    result2 = two_keys['C'].agg(np.sum)
    expected2 = two_keys.agg(np.sum).loc[:, ['A', 'B', 'C']]
    assert isinstance(result2, DataFrame)
    assert_frame_equal(result2, expected2)

    # via sum()
    result = one_key['C'].sum()
    expected = one_key.sum().loc[:, ['A', 'C']]
    assert isinstance(result, DataFrame)
    assert_frame_equal(result, expected)

    result2 = two_keys['C'].sum()
    expected2 = two_keys.sum().loc[:, ['A', 'B', 'C']]
    assert isinstance(result2, DataFrame)
    assert_frame_equal(result2, expected2)

    # corner case
    selected = one_key['C']
    with pytest.raises(Exception):
        selected['D']
def test_groupby_as_index_cython(df):
    """Check ``mean`` with ``as_index=False`` for one and two keys.

    The expectation is built from the indexed result by inserting the key
    values as leading columns and replacing the index with ``arange``.
    """
    data = df

    # single-key
    result = data.groupby('A', as_index=False).mean()
    expected = data.groupby(['A']).mean()
    expected.insert(0, 'A', expected.index)
    expected.index = np.arange(len(expected))
    assert_frame_equal(result, expected)

    # multi-key
    result = data.groupby(['A', 'B'], as_index=False).mean()
    expected = data.groupby(['A', 'B']).mean()

    key_columns = lzip(*expected.index.values)
    expected.insert(0, 'A', key_columns[0])
    expected.insert(1, 'B', key_columns[1])
    expected.index = np.arange(len(expected))
    assert_frame_equal(result, expected)
def test_groupby_as_index_series_scalar(df):
    """Aggregate one column with ``len`` under ``as_index=False`` (GH #421)."""
    two_keys = df.groupby(['A', 'B'], as_index=False)

    # GH #421
    column_result = two_keys['C'].agg(len)
    frame_result = two_keys.agg(len)
    expected = frame_result.loc[:, ['A', 'B', 'C']]
    assert_frame_equal(column_result, expected)
def test_groupby_as_index_corner(df, ts):
    """Check the error types raised for two ``as_index=False`` calls."""
    # Series with as_index=False
    with pytest.raises(TypeError):
        ts.groupby(lambda x: x.weekday(), as_index=False)

    # DataFrame with as_index=False and axis=1
    with pytest.raises(ValueError):
        df.groupby(lambda x: x.lower(), as_index=False, axis=1)
def test_groupby_multiple_key(df):
    """Group a time frame by (year, month, day) along either axis.

    The ``df`` argument is not used: the test builds its own frame with
    ``tm.makeTimeDataFrame()``.
    """
    frame = tm.makeTimeDataFrame()

    by_day = frame.groupby([lambda x: x.year, lambda x: x.month,
                            lambda x: x.day])
    agged = by_day.sum()
    assert_almost_equal(frame.values, agged.values)

    # Same grouping applied to the columns of the transposed frame.
    by_day_columns = frame.T.groupby([lambda x: x.year,
                                      lambda x: x.month,
                                      lambda x: x.day], axis=1)

    agged = by_day_columns.agg(lambda x: x.sum())
    tm.assert_index_equal(agged.index, frame.columns)
    assert_almost_equal(frame.T.values, agged.values)

    agged = by_day_columns.agg(lambda x: x.sum())
    assert_almost_equal(frame.T.values, agged.values)
def test_groupby_multi_corner(df):
    """Take a two-key ``mean`` on a frame that has an all-NaN column."""
    # test that having an all-NA column doesn't mess you up
    frame = df.copy()
    frame['bad'] = np.nan

    agged = frame.groupby(['A', 'B']).mean()

    expected = frame.groupby(['A', 'B']).mean()
    expected['bad'] = np.nan

    assert_frame_equal(agged, expected)
def test_omit_nuisance(df):
    """Compare reductions on ``df`` with ones on a chosen column subset."""
    by_a = df.groupby('A')

    result = by_a.mean()
    expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean()
    assert_frame_equal(result, expected)

    agged = by_a.agg(np.mean)
    exp = by_a.mean()
    assert_frame_equal(agged, exp)

    # Add a datetime column and compare agg(np.sum) with sum().
    subset = df.loc[:, ['A', 'C', 'D']]
    subset['E'] = datetime.now()
    by_a = subset.groupby('A')
    result = by_a.agg(np.sum)
    expected = by_a.sum()
    assert_frame_equal(result, expected)

    # won't work with axis = 1
    by_columns = subset.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1)
    with pytest.raises(TypeError):
        by_columns.agg(lambda x: x.sum(0, numeric_only=False))
def test_omit_nuisance_python_multiple(three_group):
    """``agg(np.mean)`` and ``mean()`` must agree for a two-key grouping."""
    by_a_b = three_group.groupby(['A', 'B'])

    via_agg = by_a_b.agg(np.mean)
    via_method = by_a_b.mean()

    assert_frame_equal(via_agg, via_method)
def test_empty_groups_corner(mframe):
    """Aggregate when some key combinations have no rows."""
    # handle empty groups
    frame = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                       'k2': np.array(['1', '1', '1', '2', '2', '2']),
                       'k3': ['foo', 'bar'] * 3,
                       'v1': np.random.randn(6),
                       'v2': np.random.randn(6)})

    by_keys = frame.groupby(['k1', 'k2'])
    result = by_keys.agg(np.mean)
    expected = by_keys.mean()
    assert_frame_equal(result, expected)

    # Group a two-row slice of the hierarchical frame by its first level.
    by_level = mframe[3:5].groupby(level=0)
    agged = by_level.apply(lambda x: x.mean())
    agged_A = by_level['A'].apply(np.mean)
    assert_series_equal(agged['A'], agged_A)
    assert agged.index.name == 'first'
def test_nonsense_func():
    """Grouping by a key function that fails on the labels must raise."""
    frame = DataFrame([0])
    with pytest.raises(Exception):
        frame.groupby(lambda x: x + 'foo')
def test_wrap_aggregated_output_multindex(mframe):
    """Aggregate a transposed hierarchical frame and inspect its columns."""
    frame = mframe.T
    frame['baz', 'two'] = 'peekaboo'

    keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
    agged = frame.groupby(keys).agg(np.mean)
    assert isinstance(agged.columns, MultiIndex)

    def sum_unless_foo_one(ser):
        # Raise for one specific column; the assertion below expects the
        # result to have exactly one column fewer than the input.
        if ser.name == ('foo', 'one'):
            raise TypeError
        return ser.sum()

    agged2 = frame.groupby(keys).aggregate(sum_unless_foo_one)
    assert len(agged2.columns) + 1 == len(frame.columns)
def test_groupby_level_apply(mframe):
    """The result index takes the name of the level that was grouped on."""
    by_first = mframe.groupby(level=0).count()
    assert by_first.index.name == 'first'

    by_second = mframe.groupby(level=1).count()
    assert by_second.index.name == 'second'

    # Same check for a single column (a Series).
    series_by_first = mframe['A'].groupby(level=0).count()
    assert series_by_first.index.name == 'first'
def test_groupby_level_mapper(mframe):
    """Group an index level through a dict mapping of its labels."""
    flat = mframe.reset_index()

    mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1}
    mapper1 = {'one': 0, 'two': 0, 'three': 1}

    result0 = mframe.groupby(mapper0, level=0).sum()
    result1 = mframe.groupby(mapper1, level=1).sum()

    # Map the labels by hand and group on the resulting arrays.
    mapped_level0 = np.array([mapper0.get(label) for label in flat['first']])
    mapped_level1 = np.array([mapper1.get(label)
                              for label in flat['second']])
    expected0 = mframe.groupby(mapped_level0).sum()
    expected1 = mframe.groupby(mapped_level1).sum()
    expected0.index.name, expected1.index.name = 'first', 'second'

    assert_frame_equal(result0, expected0)
    assert_frame_equal(result1, expected1)
def test_groupby_level_nonmulti():
    """Use ``level=`` on a Series with a flat index (GH 1313, GH 13901)."""
    s = Series([1, 2, 3, 10, 4, 5, 20, 6],
               Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo'))
    expected = Series([11, 22, 3, 4, 5, 6],
                      Index(range(1, 7), name='foo'))

    # These level specifications are all expected to give the same sums.
    for good_level in (0, [0], -1, [-1]):
        result = s.groupby(level=good_level).sum()
        tm.assert_series_equal(result, expected)

    # These level specifications are expected to raise ValueError.
    for bad_level in (1, -2, [], [0, 0], [0, 1], [1]):
        with pytest.raises(ValueError):
            s.groupby(level=bad_level)
def test_groupby_complex():
    """Sum complex-valued data by index level (GH 12902)."""
    values = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
    expected = Series((1 + 2j, 5 + 10j))

    via_groupby = values.groupby(level=0).sum()
    assert_series_equal(via_groupby, expected)

    via_sum_level = values.sum(level=0)
    assert_series_equal(via_sum_level, expected)
def test_mutate_groups():
    """An applied function that mutates its group gives the same result
    as one that works on a copy (GH3380)."""
    df = DataFrame({
        'cat1': ['a'] * 8 + ['b'] * 6,
        'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 +
        ['d'] * 2 + ['e'] * 2,
        'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)),
        'val': np.random.randint(100, size=14),
    })

    def f_copy(group):
        # Adds the rank column to a copy of the group.
        group = group.copy()
        group['rank'] = group.val.rank(method='min')
        return group.groupby('cat2')['rank'].min()

    def f_no_copy(group):
        # Adds the rank column to the group object it was handed.
        group['rank'] = group.val.rank(method='min')
        return group.groupby('cat2')['rank'].min()

    grpby_copy = df.groupby('cat1').apply(f_copy)
    grpby_no_copy = df.groupby('cat1').apply(f_no_copy)
    assert_series_equal(grpby_copy, grpby_no_copy)
def test_no_mutate_but_looks_like():
    """Two equivalent ways of returning the key column agree (GH 8467)."""
    # GH 8467
    # the first shows the mutation indicator
    # the second does not, but should yield the same results
    df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)})

    via_slice = df.groupby('key', group_keys=True).apply(lambda x: x[:].key)
    direct = df.groupby('key', group_keys=True).apply(lambda x: x.key)
    assert_series_equal(via_slice, direct)
def test_groupby_series_indexed_differently():
    """Group a Series by another Series that has a different index."""
    values = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7],
                    index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g']))
    keys = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0],
                  index=Index(['a', 'b', 'd', 'f', 'g', 'h']))

    agged = values.groupby(keys).mean()

    # Reindex the keys onto the values' index and group by label lookup.
    aligned_keys = keys.reindex(values.index)
    exp = values.groupby(aligned_keys.get).mean()
    assert_series_equal(agged, exp)
def test_groupby_with_hier_columns():
    """Group a frame that has MultiIndex columns, along both axes."""
    first_level = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
    second_level = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
    tuples = list(zip(*[first_level, second_level]))
    index = MultiIndex.from_tuples(tuples)
    columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
                                      ('B', 'cat'), ('A', 'dog')])
    df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)

    # Row-wise grouping keeps the hierarchical columns.
    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, columns)

    # Column-wise grouping keeps the row index.
    result = df.groupby(level=0, axis=1).mean()
    tm.assert_index_equal(result.index, df.index)

    result = df.groupby(level=0).agg(np.mean)
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0).apply(lambda x: x.mean())
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
    tm.assert_index_equal(result.columns, Index(['A', 'B']))
    tm.assert_index_equal(result.index, df.index)

    # add a nuisance column
    sorted_columns, _ = columns.sortlevel(0)
    df['A', 'foo'] = 'bar'
    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, df.columns[:-1])
def test_grouping_ndarray(df):
    """Grouping by a raw ndarray matches grouping by the column name."""
    by_values = df.groupby(df['A'].values).sum()
    by_name = df.groupby('A').sum()

    # Note: no names when grouping by value
    assert_frame_equal(by_values, by_name, check_names=False)
def test_groupby_wrong_multi_labels():
    """``agg(np.mean)`` equals ``mean()`` when grouping by four columns."""
    csv_text = """index,foo,bar,baz,spam,data
0,foo1,bar1,baz1,spam2,20
1,foo1,bar2,baz1,spam3,30
2,foo2,bar2,baz1,spam2,40
3,foo1,bar1,baz2,spam1,50
4,foo3,bar1,baz2,spam1,60"""

    frame = read_csv(StringIO(csv_text), index_col=0)

    by_four = frame.groupby(['foo', 'bar', 'baz', 'spam'])

    via_agg = by_four.agg(np.mean)
    via_method = by_four.mean()
    assert_frame_equal(via_agg, via_method)
def test_groupby_series_with_name(df):
    """Names of Series used as keys carry over into the result."""
    # single key
    indexed = df.groupby(df['A']).mean()
    flat = df.groupby(df['A'], as_index=False).mean()
    assert indexed.index.name == 'A'
    assert 'A' in flat

    # two keys
    indexed = df.groupby([df['A'], df['B']]).mean()
    flat = df.groupby([df['A'], df['B']],
                      as_index=False).mean()
    assert indexed.index.names == ('A', 'B')
    assert 'A' in flat
    assert 'B' in flat
def test_seriesgroupby_name_attr(df):
    """Reductions of a selected column keep the column's name (GH 6265)."""
    column_c = df.groupby('A')['C']
    assert column_c.count().name == 'C'
    assert column_c.mean().name == 'C'

    double_sum = lambda x: np.sum(x) * 2
    assert column_c.agg(double_sum).name == 'C'
def test_consistency_name():
    """Counting a column gives the same Series whether the column is picked
    after grouping or grouped directly (GH 12363)."""
    frame = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                             'foo', 'bar', 'foo', 'foo'],
                       'B': ['one', 'one', 'two', 'two',
                             'two', 'two', 'one', 'two'],
                       'C': np.random.randn(8) + 1.0,
                       'D': np.arange(8)})

    select_after = frame.groupby(['A']).B.count()
    select_before = frame.B.groupby(frame.A).count()
    assert_series_equal(select_before, select_after)
def test_groupby_name_propagation(df):
    """Check when the name of applied Series becomes the columns' name
    (GH 6124)."""

    def summarize(group, name=None):
        return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name)

    def summarize_random_name(group):
        # Provide a different name for each Series.  In this case, groupby
        # should not attempt to propagate the Series name since they are
        # inconsistent.
        return Series({
            'count': 1,
            'mean': 2,
            'omissions': 3,
        }, name=group.iloc[0]['A'])

    by_a = df.groupby('A')
    unnamed = by_a.apply(summarize)
    assert unnamed.columns.name is None

    by_a = df.groupby('A')
    named = by_a.apply(summarize, 'metrics')
    assert named.columns.name == 'metrics'

    by_a = df.groupby('A')
    mixed = by_a.apply(summarize_random_name)
    assert mixed.columns.name is None
def test_groupby_nonstring_columns():
    """Group by an integer column label."""
    frame = DataFrame([np.arange(10) for x in range(10)])

    by_label = frame.groupby(0).mean()
    by_column = frame.groupby(frame[0]).mean()
    assert_frame_equal(by_label, by_column)
def test_groupby_mixed_type_columns():
    """Reduce a frame whose column labels mix str and int (GH 13432)."""
    # GH 13432, unorderable types in py3
    frame = DataFrame([[0, 1, 2]], columns=['A', 'B', 0])
    expected = DataFrame([[1, 2]], columns=['B', 0],
                         index=Index([0], name='A'))

    first = frame.groupby('A').first()
    tm.assert_frame_equal(first, expected)

    total = frame.groupby('A').sum()
    tm.assert_frame_equal(total, expected)
def test_cython_grouper_series_bug_noncontig():
    """Aggregate a Series built from a column slice of a 2-D NaN array."""
    block = np.empty((100, 100))
    block.fill(np.nan)

    # Column slice of the 2-D array, used as the Series data.
    series = Series(block[:, 0], index=lrange(100))
    keys = np.tile(lrange(10), 10)

    medians = series.groupby(keys).agg(Series.median)
    assert medians.isna().all()
def test_series_grouper_noncontig_index():
    """Smoke-test ``agg`` on a Series indexed by every other label."""
    full_index = Index(tm.rands_array(10, 100))

    values = Series(np.random.randn(50), index=full_index[::2])
    labels = np.random.randint(0, 5, 50)

    # it works!
    grouped = values.groupby(labels)

    # accessing the index elements causes segfault
    count_distinct_ids = lambda x: len(set(map(id, x.index)))
    grouped.agg(count_distinct_ids)
def test_convert_objects_leave_decimal_alone():
    """``Decimal`` results from ``agg`` stay ``Decimal`` in an object
    Series."""
    series = Series(lrange(5))
    labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O')

    def convert_fast(x):
        return Decimal(str(x.mean()))

    def convert_force_pure(x):
        # base will be length 0
        assert (len(x.values.base) > 0)
        return Decimal(str(x.mean()))

    grouped = series.groupby(labels)

    for converter in (convert_fast, convert_force_pure):
        result = grouped.agg(converter)
        assert result.dtype == np.object_
        assert isinstance(result[0], Decimal)
def test_groupby_dtype_inference_empty():
    """Dtypes are kept when grouping an empty frame (GH 6733)."""
    frame = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')})
    assert frame['x'].dtype == np.float64

    result = frame.groupby('x').first()

    empty_index = Index([], name='x', dtype=np.float64)
    empty_range = Series([], index=empty_index, dtype='int64')
    expected = DataFrame({'range': empty_range})
    assert_frame_equal(result, expected, by_blocks=True)
def test_groupby_list_infer_array_like(df):
    """Group by a plain list and by a list of column names.

    A list as long as the frame is used as an array of keys; a list one
    element shorter must raise.  A list of column names is used as column
    keys, and must give the same means as grouping by the corresponding
    column Series.
    """
    result = df.groupby(list(df['A'])).mean()
    expected = df.groupby(df['A']).mean()
    assert_frame_equal(result, expected, check_names=False)

    # A key list one element shorter than the frame cannot be aligned.
    pytest.raises(Exception, df.groupby, list(df['A'][:-1]))

    # pathological case of ambiguity
    df = DataFrame({'foo': [0, 1],
                    'bar': [3, 4],
                    'val': np.random.randn(2)})

    result = df.groupby(['foo', 'bar']).mean()
    expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
    # The original computed both frames but never compared them, so this
    # case checked nothing beyond "does not raise".
    assert_frame_equal(result, expected)
def test_groupby_keys_same_size_as_index():
    """Group by an index-level ``Grouper`` plus a column (GH 11185)."""
    freq = 's'
    start = pd.Timestamp('2015-09-29T11:34:44-0700')
    index = pd.date_range(start=start, periods=2, freq=freq)
    frame = pd.DataFrame([['A', 10], ['B', 15]],
                         columns=['metric', 'values'],
                         index=index)

    keys = [pd.Grouper(level=0, freq=freq), 'metric']
    result = frame.groupby(keys).mean()

    expected = frame.set_index([frame.index, 'metric'])
    assert_frame_equal(result, expected)
def test_groupby_one_row():
    """Grouping by a missing column raises ``KeyError`` (GH 11741)."""
    # Frames with one row and with two rows.
    for n_rows in (1, 2):
        frame = pd.DataFrame(np.random.randn(n_rows, 4), columns=list('ABCD'))
        with pytest.raises(KeyError):
            frame.groupby('Z')
def test_groupby_nat_exclude():
    """Missing datetime keys are left out of the groups (GH 6992)."""
    df = pd.DataFrame(
        {'values': np.random.randn(8),
         'dt': [np.nan, pd.Timestamp('2013-01-01'), np.nan, pd.Timestamp(
             '2013-02-01'), np.nan, pd.Timestamp('2013-02-01'), np.nan,
             pd.Timestamp('2013-01-01')],
         'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']})
    grouped = df.groupby('dt')

    expected_labels = [pd.Index([1, 7]), pd.Index([3, 5])]
    keys = sorted(grouped.groups.keys())
    assert len(keys) == 2
    for key, labels in zip(keys, expected_labels):
        # grouped.groups keys are np.datetime64 with system tz
        # not to be affected by tz, only compare values
        tm.assert_index_equal(grouped.groups[key], labels)

    # confirm obj is not filtered
    tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
    assert grouped.ngroups == 2

    expected_positions = {
        Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64),
        Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64)
    }
    for key in grouped.indices:
        tm.assert_numpy_array_equal(grouped.indices[key],
                                    expected_positions[key])

    tm.assert_frame_equal(
        grouped.get_group(Timestamp('2013-01-01')), df.iloc[[1, 7]])
    tm.assert_frame_equal(
        grouped.get_group(Timestamp('2013-02-01')), df.iloc[[3, 5]])

    with pytest.raises(KeyError):
        grouped.get_group(pd.NaT)

    # Columns made up entirely of NaN / NaT give no groups at all.
    nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
                        'nat': [pd.NaT, pd.NaT, pd.NaT]})
    assert nan_df['nan'].dtype == 'float64'
    assert nan_df['nat'].dtype == 'datetime64[ns]'

    for column in ['nan', 'nat']:
        grouped = nan_df.groupby(column)
        assert grouped.groups == {}
        assert grouped.ngroups == 0
        assert grouped.indices == {}
        for missing in (np.nan, pd.NaT):
            with pytest.raises(KeyError):
                grouped.get_group(missing)
def test_sparse_friendly(df):
    """Smoke-test groupby operations on sparse frames and Series."""
    sparse_frame = df[['C', 'D']].to_sparse()
    with catch_warnings(record=True):
        panel = tm.makePanel()
        tm.add_nans(panel)

    def _check_work(gp):
        # Each call only has to complete without raising.
        gp.mean()
        gp.agg(np.mean)
        dict(iter(gp))

    # it works!
    halve = lambda x: x // 2
    _check_work(sparse_frame.groupby(halve))
    _check_work(sparse_frame['C'].groupby(halve))
    _check_work(sparse_frame.groupby(df['A']))

    # do this someday
    # _check_work(panel.groupby(lambda x: x.month, axis=1))
def test_panel_groupby():
    """Group a Panel along items, major and minor axes and check labels."""
    # warnings raised inside this block are recorded instead of shown
    with catch_warnings(record=True):
        panel = tm.makePanel()
        tm.add_nans(panel)

        item_map = {'ItemA': 0, 'ItemB': 0, 'ItemC': 1}
        by_item = panel.groupby(item_map, axis='items')
        item_means = by_item.mean()
        item_means_via_agg = by_item.agg(lambda x: x.mean('items'))
        tm.assert_panel_equal(item_means, item_means_via_agg)
        tm.assert_index_equal(item_means.items, Index([0, 1]))

        by_month = panel.groupby(lambda x: x.month, axis='major')
        month_means = by_month.mean()
        months = Index(sorted(set(panel.major_axis.month)))
        tm.assert_index_equal(month_means.major_axis, months)

        minor_map = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
        by_minor = panel.groupby(minor_map, axis='minor')
        minor_means = by_minor.mean()
        tm.assert_index_equal(minor_means.minor_axis, Index([0, 1]))
def test_groupby_2d_malformed():
    """Group means of a frame that also holds a string column.

    The result is expected to contain only the 'zeros' and 'ones' columns,
    with int64 values.
    """
    frame = DataFrame(index=lrange(2))
    frame['group'] = ['g1', 'g2']
    frame['zeros'] = [0, 0]
    frame['ones'] = [1, 1]
    frame['label'] = ['l1', 'l2']

    means = frame.groupby(['group']).mean()

    expected_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
    tm.assert_index_equal(means.columns, Index(['zeros', 'ones']))
    tm.assert_numpy_array_equal(means.values, expected_values)
def test_int32_overflow():
    """Four-key groupby gives the same number of groups in either key order.

    NOTE(review): judging by the name, this presumably guards against
    overflow while combining the key codes -- confirm against the
    implementation.
    """
    n_rows = 25000
    col_a = np.arange(n_rows)
    col_b = np.concatenate(
        [np.arange(size) for size in (10000, 10000, 5000)])
    df = DataFrame({'A': col_a,
                    'B': col_b,
                    'C': col_a,
                    'D': col_b,
                    'E': np.random.randn(n_rows)})

    keys = ['A', 'B', 'C', 'D']
    forward = df.groupby(keys).sum()
    backward = df.groupby(keys[::-1]).sum()
    assert len(forward) == len(backward)
def test_groupby_sort_multi():
    """Multi-key groupby results are indexed by the expected key tuples."""
    words = DataFrame({'a': ['foo', 'bar', 'baz'],
                       'b': [3, 2, 1],
                       'c': [0, 1, 2],
                       'd': np.random.randn(3)})

    # (grouping keys, how the row tuples must be reordered to match the
    # sorted result index); slice(None) means "already in order"
    cases = [(['a', 'b', 'c'], [1, 2, 0]),
             (['c', 'a', 'b'], slice(None)),
             (['b', 'c', 'a'], [2, 1, 0])]
    for keys, order in cases:
        row_tuples = com._asarray_tuplesafe(
            lmap(tuple, words[keys].values))
        result = words.groupby(keys, sort=True).sum()
        tm.assert_numpy_array_equal(result.index.values, row_tuples[order])

    ints = DataFrame({'a': [0, 1, 2, 0, 1, 2],
                      'b': [0, 0, 0, 1, 1, 1],
                      'd': np.random.randn(6)})
    result = ints.groupby(['a', 'b'])['d'].sum()

    # grouping by the row tuples themselves must give the same sums
    row_tuples = com._asarray_tuplesafe(
        lmap(tuple, ints[['a', 'b']].values))
    expected = ints.groupby(row_tuples)['d'].sum()
    for key, value in compat.iteritems(expected):
        assert (result[key] == value)
def test_dont_clobber_name_column():
    """An identity ``apply`` returns the frame unchanged.

    The frame has a column called 'name'; the test checks that it survives
    a round trip through ``groupby(...).apply``.
    """
    df = DataFrame({'key': ['a', 'a', 'a', 'b', 'b', 'b'],
                    'name': ['foo', 'bar', 'baz'] * 2})

    # group_keys=False is spelled out so the expectation (result equals the
    # input, original index included) does not rest on the library default
    # for index-preserving applies.
    result = df.groupby('key', group_keys=False).apply(lambda x: x)
    assert_frame_equal(result, df)
def test_skip_group_keys():
    """With ``group_keys=False``, ``apply`` equals concatenating the pieces.

    Checked for a frame and for one of its columns.
    """
    tsf = tm.makeTimeDataFrame()

    by_month = tsf.groupby(lambda x: x.month, group_keys=False)
    result = by_month.apply(lambda x: x.sort_values(by='A')[:3])
    expected = pd.concat(
        [group.sort_values(by='A')[:3] for _, group in by_month])
    assert_frame_equal(result, expected)

    by_month = tsf['A'].groupby(lambda x: x.month, group_keys=False)
    result = by_month.apply(lambda x: x.sort_values()[:3])
    expected = pd.concat(
        [group.sort_values()[:3] for _, group in by_month])
    assert_series_equal(result, expected)
def test_no_nonsense_name(frame):
    """Aggregating an unnamed Series leaves the result unnamed."""
    # GH #995
    unnamed = frame['C'].copy()
    unnamed.name = None

    summed = unnamed.groupby(frame['A']).agg(np.sum)
    assert summed.name is None
def test_multifunc_sum_bug():
    """A float column summed via a dict ``agg`` keeps dtype float64."""
    # GH #1065
    frame = DataFrame(np.arange(9).reshape(3, 3))
    frame['test'] = 0
    frame['fl'] = [1.3, 1.5, 1.6]

    result = frame.groupby('test').agg({'fl': 'sum', 2: 'size'})
    assert result['fl'].dtype == np.float64
def test_handle_dict_return_value(df):
    """``apply`` treats a returned dict like the equivalent Series."""
    def extremes(group):
        return {'max': group.max(), 'min': group.min()}

    def extremes_as_series(group):
        return Series(extremes(group))

    result = df.groupby('A')['C'].apply(extremes)
    expected = df.groupby('A')['C'].apply(extremes_as_series)

    assert isinstance(result, Series)
    assert_series_equal(result, expected)
@pytest.mark.parametrize('grouper', ['A', ['A', 'B']])
def test_set_group_name(df, grouper):
    """Every object handed to a user function has a non-None ``name``.

    The check lives inside the callbacks; the calls below only need to run
    without raising.
    """
    def f(group):
        assert group.name is not None
        return group

    def freduce(group):
        assert group.name is not None
        return group.sum()

    def foo(x):
        # second reducer for the list aggregation further down
        return freduce(x)

    grouped = df.groupby(grouper)

    # make sure all these work
    # ... on the whole frame
    grouped.apply(f)
    grouped.aggregate(freduce)
    grouped.aggregate({'C': freduce, 'D': freduce})
    grouped.transform(f)

    # ... and on a single selected column
    grouped['C'].apply(f)
    grouped['C'].aggregate(freduce)
    grouped['C'].aggregate([freduce, foo])
    grouped['C'].transform(f)
def test_group_name_available_in_inference_pass():
    """``group.name`` is readable on every call ``apply`` makes."""
    # gh-15062
    df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})
    seen = []

    def record_name(group):
        seen.append(group.name)
        return group.copy()

    df.groupby('a', sort=False, group_keys=False).apply(record_name)

    # we expect 2 zeros because we call ``record_name`` once to see if a
    # faster route can be used.
    assert seen == [0, 0, 1, 2]
def test_no_dummy_key_names(df):
    """Grouping by raw arrays leaves the result index levels unnamed."""
    # see gh-1291
    a_values = df['A'].values
    b_values = df['B'].values

    by_one = df.groupby(a_values).sum()
    assert by_one.index.name is None

    by_two = df.groupby([a_values, b_values]).sum()
    assert by_two.index.names == (None, None)
def test_groupby_sort_multiindex_series():
    """``sort`` is honoured when grouping a Series by MultiIndex levels.

    With ``sort=False`` the groups come back in order of first appearance;
    with ``sort=True`` they come back in sorted key order.
    """
    # series multiindex groupby sort argument was not being passed through
    # _compress_group_index
    # GH 9444

    # The indexes are written as explicit key tuples rather than level/code
    # arrays: same index, but the keys are readable and the test no longer
    # depends on the name of the codes keyword of the MultiIndex constructor.
    index = MultiIndex.from_tuples(
        [(1, 2), (1, 2), (1, 1), (1, 1), (2, 1), (2, 1)],
        names=['a', 'b'])
    mseries = Series([0, 1, 2, 3, 4, 5], index=index)

    index = MultiIndex.from_tuples(
        [(1, 2), (1, 1), (2, 1)], names=['a', 'b'])
    mseries_result = Series([0, 2, 4], index=index)

    result = mseries.groupby(level=['a', 'b'], sort=False).first()
    assert_series_equal(result, mseries_result)

    result = mseries.groupby(level=['a', 'b'], sort=True).first()
    assert_series_equal(result, mseries_result.sort_index())
def test_groupby_reindex_inside_function():
    """Indexing into a group inside an aggregator does not change results.

    The same aggregation is run twice per day of data: once plainly and
    once with an extra, otherwise unused, lookup on the group
    (``fix=True``). Both runs must produce the same frame.
    """
    periods = 1000
    ind = date_range(start='2012/1/1', freq='5min', periods=periods)
    df = DataFrame({'high': np.arange(periods),
                    'low': np.arange(periods)}, index=ind)

    def agg_before(hour, func, fix=False):
        """
        Run an aggregate func on the subset of data before ``hour``.

        Returns a callable suitable for ``agg``; it yields ``None`` when
        no row of the group falls before ``hour``.
        """
        def _func(data):
            # honour the ``hour`` argument instead of a hard-coded cutoff
            d = data.loc[data.index.map(
                lambda x: x.hour < hour)].dropna()
            if fix:
                # deliberate no-op lookup; only its side effects (if any)
                # on the group are under test
                data[data.index[0]]
            if len(d) == 0:
                return None
            return func(d)
        return _func

    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
    closure_bad = grouped.agg({'high': agg_before(11, np.max)})
    closure_good = grouped.agg({'high': agg_before(11, np.max, True)})

    assert_frame_equal(closure_bad, closure_good)
def test_groupby_multiindex_missing_pair():
    """Only level pairs present in the data appear in the grouped sum.

    The pair ('b', 'd') never occurs in the input and must not show up in
    the result.
    """
    # GH9049
    raw = DataFrame({'group1': ['a', 'a', 'a', 'b'],
                     'group2': ['c', 'c', 'd', 'c'],
                     'value': [1, 1, 1, 5]})
    indexed = raw.set_index(['group1', 'group2'])

    by_levels = indexed.groupby(level=['group1', 'group2'], sort=True)
    result = by_levels.agg('sum')

    expected_index = MultiIndex.from_tuples(
        [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2'])
    expected = DataFrame([[2], [1], [5]], index=expected_index,
                         columns=['value'])
    tm.assert_frame_equal(result, expected)
def test_groupby_multiindex_not_lexsorted():
    """Grouping gives the same answer whether or not an axis is lexsorted.

    Part one compares a lexsorted and a non-lexsorted column MultiIndex
    (the latter is expected to emit a ``PerformanceWarning``). Part two
    applies a transforming function over a non-lexsorted row MultiIndex.
    """
    # GH 11640

    # define the lexsorted version
    sorted_columns = MultiIndex.from_tuples(
        [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
    lexsorted_df = DataFrame([[1, 3, 4]], columns=sorted_columns)
    assert lexsorted_df.columns.is_lexsorted()

    # define the non-lexsorted version
    long_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                        data=[[1, 'b1', 'c1', 3],
                              [1, 'b2', 'c2', 4]])
    pivoted = long_df.pivot_table(
        index='a', columns=['b', 'c'], values='d')
    not_lexsorted_df = pivoted.reset_index()
    assert not not_lexsorted_df.columns.is_lexsorted()

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.groupby('a').mean()
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.groupby('a').mean()
    tm.assert_frame_equal(expected, result)

    # a transforming function should work regardless of sort
    # GH 14776
    df = DataFrame({'x': ['a', 'a', 'b', 'a'],
                    'y': [1, 1, 2, 2],
                    'z': [1, 2, 3, 4]}).set_index(['x', 'y'])
    assert not df.index.is_lexsorted()

    for level in [0, 1, [0, 1]]:
        for sort in [False, True]:
            # once on the frame as built, once on its sorted counterpart
            for frame in (df, df.sort_index()):
                result = frame.groupby(level=level, sort=sort).apply(
                    DataFrame.drop_duplicates)
                tm.assert_frame_equal(frame, result)
def test_index_label_overlaps_location():
    """``filter`` selects rows by position even when labels look like
    positions.

    Run once with an integer index and once with the same labels as floats.
    """
    # checking we don't have any label/location confusion in the
    # the wake of GH5375
    df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1])

    def _check_filter(frame):
        keys = list('ababb')
        kept = [1, 3, 4]  # positions of the rows in the larger group 'b'

        actual = frame.groupby(keys).filter(lambda x: len(x) > 2)
        assert_frame_equal(actual, frame.iloc[kept])

        ser = frame[0]
        actual = ser.groupby(keys).filter(lambda x: len(x) > 2)
        assert_series_equal(actual, ser.take(kept))

    _check_filter(df)

    # ... and again, with a generic Index of floats
    df.index = df.index.astype(float)
    _check_filter(df)
def test_transform_doesnt_clobber_ints():
    """``transform('mean')`` agrees for an int key and the same key as
    float."""
    # GH 7972
    values = np.arange(6)
    int_keyed = DataFrame({'a': values // 2,
                           'b': 2.0 * values,
                           'c': 3.0 * values})
    float_keyed = DataFrame({'a': values // 2 * 1.0,
                             'b': 2.0 * values,
                             'c': 3.0 * values})

    result = int_keyed.groupby('a').transform('mean')
    expected = float_keyed.groupby('a').transform('mean')
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize('sort_column', ['ints', 'floats', 'strings',
                                         ['ints', 'floats'],
                                         ['ints', 'strings']])
@pytest.mark.parametrize('group_column', ['int_groups', 'string_groups',
                                          ['int_groups', 'string_groups']])
def test_groupby_preserves_sort(sort_column, group_column):
    """Each group seen by ``apply`` is still ordered by ``sort_column``."""
    # Test to ensure that groupby always preserves sort order of original
    # object. Issue #8588 and #9651
    raw = DataFrame(
        {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3],
         'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'],
         'ints': [8, 7, 4, 5, 2, 9, 1, 1],
         'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
         'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']})

    # Try sorting on different types and with different group types
    ordered = raw.sort_values(by=sort_column)

    def _assert_still_sorted(group):
        # re-sorting an already sorted group must be a no-op
        assert_frame_equal(group, group.sort_values(by=sort_column))

    ordered.groupby(group_column).apply(_assert_still_sorted)
def test_group_shift_with_null_key():
    """``shift`` within groups yields NaN for rows whose key is partly
    missing."""
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Generate a moderately large dataframe with occasional missing
    # values in column `B`, and then group by [`A`, `B`]. This should
    # force `-1` in `labels` array of `g.grouper.group_info` exactly
    # at those places, where the group-by key is partially missing.
    rows = [(i % 12, i % 3 if i % 3 else np.nan, i)
            for i in range(n_rows)]
    df = DataFrame(rows, dtype=float,
                   columns=["A", "B", "Z"], index=None)
    g = df.groupby(["A", "B"])

    # rows with a complete key get the value 12 rows further down, except
    # in the last 12 rows where no later group member exists
    shifted = [(i + 12 if i % 3 and i < n_rows - 12 else np.nan)
               for i in range(n_rows)]
    expected = DataFrame(shifted, dtype=float,
                         columns=["Z"], index=None)

    result = g.shift(-1)
    assert_frame_equal(result, expected)
def test_pivot_table_values_key_error():
    """``pivot_table`` raises ``KeyError`` when ``values`` names a column
    that does not exist."""
    # This test is designed to replicate the error in issue #14938

    # ``datetime`` comes from the standard library import at the top of the
    # file; the ``pd.datetime`` alias is not part of the documented API.
    event_dates = pd.date_range(datetime.today(), periods=20, freq='M')
    df = pd.DataFrame({'eventDate': event_dates.tolist(),
                       'thename': range(0, 20)})

    event_index = df.set_index('eventDate').index
    df['year'] = event_index.year
    df['month'] = event_index.month

    with pytest.raises(KeyError):
        df.reset_index().pivot_table(index='year', columns='month',
                                     values='badname', aggfunc='count')
def test_empty_dataframe_groupby():
    """Summing an empty grouped frame gives an empty frame indexed by the
    key."""
    # GH8093
    empty = DataFrame(columns=['A', 'B', 'C'])

    result = empty.groupby('A').sum()

    expected = DataFrame(columns=['B', 'C'], dtype=np.float64)
    expected.index.name = 'A'
    assert_frame_equal(result, expected)
def test_tuple_warns():
    """A tuple ``by`` warns only when it is not itself a column label."""
    # https://github.com/pandas-dev/pandas/issues/18314
    df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2],
                       'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]})
    by = ('a', 'b')

    # the column subset has no column labelled ('a', 'b')
    with tm.assert_produces_warning(FutureWarning) as w:
        df[['a', 'b', 'c']].groupby(by).c.mean()
    assert "Interpreting tuple 'by' as a list" in str(w[0].message)

    # the full frame does have a column labelled ('a', 'b')
    with tm.assert_produces_warning(None):
        df.groupby(by).c.mean()
def test_tuple_warns_unhashable():
    """A tuple of index attributes passed as ``by`` triggers the
    tuple-as-list ``FutureWarning``."""
    # https://github.com/pandas-dev/pandas/issues/18314
    business_dates = date_range(start='4/1/2014', end='6/30/2014',
                                freq='B')
    df = DataFrame(1, index=business_dates, columns=['a', 'b'])

    with tm.assert_produces_warning(FutureWarning) as w:
        df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])

    expected_message = "Interpreting tuple 'by' as a list"
    assert expected_message in str(w[0].message)
def test_tuple_correct_keyerror():
    """Grouping by a tuple that is not a column label raises a ``KeyError``
    that names the tuple."""
    # https://github.com/pandas-dev/pandas/issues/18798
    df = pd.DataFrame(1, index=range(3),
                      columns=pd.MultiIndex.from_product([[1, 2],
                                                          [3, 4]]))

    # The parentheses are escaped so the pattern matches the literal text
    # "(7, 8)"; left unescaped they form a regex group, and the pattern
    # would accept any message merely containing "7, 8".
    with pytest.raises(KeyError, match=r"\(7, 8\)"):
        df.groupby((7, 8)).mean()
def test_groupby_agg_ohlc_non_first():
    """``agg`` accepts 'ohlc' when it is not the first function listed."""
    # GH 21716
    def _two_days():
        return pd.date_range('2018-01-01', periods=2, freq='D')

    df = pd.DataFrame([[1], [1]], columns=['foo'], index=_two_days())

    expected_columns = pd.MultiIndex.from_tuples((
        ('foo', 'ohlc', 'open'), ('foo', 'ohlc', 'high'),
        ('foo', 'ohlc', 'low'), ('foo', 'ohlc', 'close'),
        ('foo', 'sum', 'foo')))
    expected = pd.DataFrame([
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]
    ], columns=expected_columns, index=_two_days())

    result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc'])

    tm.assert_frame_equal(result, expected)