1695 lines
53 KiB
Python
1695 lines
53 KiB
Python
# -*- coding: utf-8 -*-
|
|
from __future__ import print_function
|
|
|
|
import pytest
|
|
|
|
from warnings import catch_warnings
|
|
from datetime import datetime
|
|
from decimal import Decimal
|
|
|
|
from pandas import (date_range, Timestamp,
|
|
Index, MultiIndex, DataFrame, Series,
|
|
Panel, DatetimeIndex, read_csv)
|
|
from pandas.errors import PerformanceWarning
|
|
from pandas.util.testing import (assert_frame_equal,
|
|
assert_series_equal, assert_almost_equal)
|
|
from pandas.compat import (range, lrange, StringIO, lmap, lzip, map, zip,
|
|
OrderedDict)
|
|
from pandas import compat
|
|
from collections import defaultdict
|
|
import pandas.core.common as com
|
|
import numpy as np
|
|
|
|
import pandas.util.testing as tm
|
|
import pandas as pd
|
|
|
|
|
|
def test_repr():
    """Check the repr of a ``Grouper`` built with both ``key`` and ``level``."""
    # GH18203
    grouper = pd.Grouper(key='A', level='B')
    expected = "Grouper(key='A', level='B', axis=0, sort=False)"
    assert repr(grouper) == expected
|
|
|
|
|
|
@pytest.mark.parametrize('dtype', ['int64', 'int32', 'float64', 'float32'])
def test_basic(dtype):
    """Smoke-test aggregate/apply/transform on a shuffled Series.

    The Series holds ``np.arange(9) // 3`` cast to ``dtype``; after its
    index is shuffled it is grouped by ``label // 3``.
    """
    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)

    shuffled = np.arange(9)
    np.random.shuffle(shuffled)
    data = data.reindex(shuffled)

    grouped = data.groupby(lambda x: x // 3)

    for _, group in grouped:
        assert len(group) == 3

    agged = grouped.aggregate(np.mean)
    assert agged[1] == 1

    assert_series_equal(agged, grouped.agg(np.mean))  # shorthand
    assert_series_equal(agged, grouped.mean())
    assert_series_equal(grouped.agg(np.sum), grouped.sum())

    # apply and transform must agree for a shape-preserving function
    expected = grouped.apply(lambda x: x * x.sum())
    transformed = grouped.transform(lambda x: x * x.sum())
    assert transformed[7] == 12
    assert_series_equal(transformed, expected)

    # grouping by the values themselves gives the same means
    value_grouped = data.groupby(data)
    assert_series_equal(value_grouped.aggregate(np.mean), agged,
                        check_index_type=False)

    # complex agg
    agged = grouped.aggregate([np.mean, np.std])

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        agged = grouped.aggregate({'one': np.mean, 'two': np.std})

    group_constants = {0: 10, 1: 20, 2: 30}
    agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
    assert agged[1] == 21

    # corner cases
    with pytest.raises(Exception):
        grouped.aggregate(lambda x: x * 2)
|
|
|
|
|
|
def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
    """Group by integer labels and check dtype counts after an ``apply``."""
    key = mframe.index.labels[0]
    result = mframe.groupby(key).sum()

    # the same labels cast to object must group identically
    expected = mframe.groupby(key.astype('O')).sum()
    assert_frame_equal(result, expected)

    # GH 3911, mixed frame non-conversion
    df = df_mixed_floats.copy()
    df['value'] = lrange(len(df))

    def max_value(group):
        # row of the group that holds the largest 'value'
        return group.loc[group['value'].idxmax()]

    applied = df.groupby('A').apply(max_value)
    result = applied.get_dtype_counts().sort_values()
    expected = Series({'float64': 2,
                       'int64': 1,
                       'object': 2}).sort_values()
    assert_series_equal(result, expected)
|
|
|
|
|
|
def test_groupby_return_type():
    """Check the container type returned by ``apply`` and ``count``."""

    def demean_val2(dataf):
        return dataf["val2"] - dataf["val2"].mean()

    # GH2893, return a reduced type
    df1 = DataFrame([{"val1": 1, "val2": 20},
                     {"val1": 1, "val2": 19},
                     {"val1": 2, "val2": 27},
                     {"val1": 2, "val2": 12}])
    result = df1.groupby("val1", squeeze=True).apply(demean_val2)
    assert isinstance(result, Series)

    # same check with a single group
    df2 = DataFrame([{"val1": 1, "val2": 20},
                     {"val1": 1, "val2": 19},
                     {"val1": 1, "val2": 27},
                     {"val1": 1, "val2": 12}])
    result = df2.groupby("val1", squeeze=True).apply(demean_val2)
    assert isinstance(result, Series)

    # GH3596, return a consistent type (regression in 0.11 from 0.10.1)
    df = DataFrame([[1, 1], [1, 1]], columns=['X', 'Y'])
    result = df.groupby('X', squeeze=False).count()
    assert isinstance(result, DataFrame)

    # GH5592
    # inconsistent return type
    df = DataFrame(dict(A=['Tiger', 'Tiger', 'Tiger', 'Lamb', 'Lamb',
                           'Pony', 'Pony'],
                        B=Series(np.arange(7), dtype='int64'),
                        C=date_range('20130101', periods=7)))

    def first_row(grp):
        return grp.iloc[0]

    def first_row_unless(skipped):
        # build an applier returning None for the group named `skipped`
        def f(grp):
            if grp.name == skipped:
                return None
            return grp.iloc[0]
        return f

    expected = df.groupby('A').first()[['B']]
    result = df.groupby('A').apply(first_row)[['B']]
    assert_frame_equal(result, expected)

    result = df.groupby('A').apply(first_row_unless('Tiger'))[['B']]
    e = expected.copy()
    e.loc['Tiger'] = np.nan
    assert_frame_equal(result, e)

    result = df.groupby('A').apply(first_row_unless('Pony'))[['B']]
    e = expected.copy()
    e.loc['Pony'] = np.nan
    assert_frame_equal(result, e)

    # 5592 revisited, with datetimes
    result = df.groupby('A').apply(first_row_unless('Pony'))[['C']]
    e = df.groupby('A').first()[['C']]
    e.loc['Pony'] = pd.NaT
    assert_frame_equal(result, e)

    # scalar outputs
    def first_c_unless_pony(grp):
        if grp.name == 'Pony':
            return None
        return grp.iloc[0].loc['C']

    result = df.groupby('A').apply(first_c_unless_pony)
    e = df.groupby('A').first()['C'].copy()
    e.loc['Pony'] = np.nan
    e.name = None
    assert_series_equal(result, e)
|
|
|
|
|
|
def test_pass_args_kwargs(ts, tsframe):
    """Extra positional/keyword arguments are forwarded to the callable.

    The 80th percentile is computed through ``agg``, ``apply`` and
    ``transform`` and compared with ``quantile(.8)``.
    """

    def f(x, q=None, axis=0):
        return np.percentile(x, q, axis=axis)

    g = lambda x: np.percentile(x, 80, axis=0)

    # Series
    ts_grouped = ts.groupby(lambda x: x.month)
    agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
    apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
    trans_result = ts_grouped.transform(np.percentile, 80, axis=0)

    agg_expected = ts_grouped.quantile(.8)
    trans_expected = ts_grouped.transform(g)

    assert_series_equal(apply_result, agg_expected)
    assert_series_equal(agg_result, agg_expected, check_names=False)
    assert_series_equal(trans_result, trans_expected)

    # same thing, passing the percentile as a keyword
    agg_result = ts_grouped.agg(f, q=80)
    apply_result = ts_grouped.apply(f, q=80)
    trans_result = ts_grouped.transform(f, q=80)
    assert_series_equal(agg_result, agg_expected)
    assert_series_equal(apply_result, agg_expected)
    assert_series_equal(trans_result, trans_expected)

    # DataFrame
    df_grouped = tsframe.groupby(lambda x: x.month)
    agg_result = df_grouped.agg(np.percentile, 80, axis=0)
    apply_result = df_grouped.apply(DataFrame.quantile, .8)
    expected = df_grouped.quantile(.8)
    assert_frame_equal(apply_result, expected)
    assert_frame_equal(agg_result, expected, check_names=False)

    agg_result = df_grouped.agg(f, q=80)
    apply_result = df_grouped.apply(DataFrame.quantile, q=.8)
    assert_frame_equal(agg_result, expected, check_names=False)
    assert_frame_equal(apply_result, expected)
|
|
|
|
|
|
def test_len():
    """Check ``len`` of GroupBy objects, including all-NaN keys."""
    df = tm.makeTimeDataFrame()

    by_day = df.groupby([lambda x: x.year, lambda x: x.month,
                         lambda x: x.day])
    assert len(by_day) == len(df)

    by_month = df.groupby([lambda x: x.year, lambda x: x.month])
    distinct_months = {(x.year, x.month) for x in df.index}
    assert len(by_month) == len(distinct_months)

    # issue 11016
    df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
    assert len(df.groupby(('a'))) == 0
    assert len(df.groupby(('b'))) == 3
    assert len(df.groupby(['a', 'b'])) == 3
|
|
|
|
|
|
def test_basic_regression():
    """Group a Series by a longer float Series; only checks ``mean`` runs."""
    # regression
    values = [1.0 * x for x in lrange(1, 10) * 10][:1095]
    result = Series(values, lrange(0, len(values)))

    raw_keys = np.random.random((1100, ))
    groupings = Series(raw_keys, lrange(0, len(raw_keys))) * 10.

    grouped = result.groupby(groupings)
    grouped.mean()
|
|
|
|
|
|
@pytest.mark.parametrize('dtype', ['float64', 'float32', 'int64',
                                   'int32', 'int16', 'int8'])
def test_with_na_groups(dtype):
    """Aggregate a Series whose group labels contain NaN."""
    index = Index(np.arange(10))
    values = Series(np.ones(10), index, dtype=dtype)
    labels = Series([np.nan, 'foo', 'bar', 'bar', np.nan, np.nan,
                     'bar', 'bar', np.nan, 'foo'], index=index)
    grouped = values.groupby(labels)

    # only the labelled rows are counted: four 'bar', two 'foo'
    expected = Series([4, 2], index=['bar', 'foo'])

    # this SHOULD be an int
    agged = grouped.agg(len)
    assert_series_equal(agged, expected, check_dtype=False)

    # assert issubclass(agged.dtype.type, np.integer)

    # explicitly return a float from my function
    def f(x):
        return float(len(x))

    agged = grouped.agg(f)
    assert_series_equal(agged, expected, check_dtype=False)
    assert issubclass(agged.dtype.type, np.dtype(dtype).type)
|
|
|
|
|
|
def test_indices_concatenation_order():
    """``apply`` results with incompatible indexes must raise."""
    # GH 2808

    def empty_frame(columns, names):
        # zero-row frame carrying a two-level MultiIndex named `names`
        multiindex = MultiIndex(levels=[[]] * 2, labels=[[]] * 2,
                                names=names)
        return DataFrame(None, columns=columns, index=multiindex)

    def f1(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            return empty_frame(['a'], ['b', 'c'])
        return y.set_index(['b', 'c'])

    def f2(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            return DataFrame()
        return y.set_index(['b', 'c'])

    def f3(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            return empty_frame(['a', 'b'], ['foo', 'bar'])
        return y

    df = DataFrame({'a': [1, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})

    df2 = DataFrame({'a': [3, 2, 2, 2], 'b': lrange(4), 'c': lrange(5, 9)})

    # correct result
    result1 = df.groupby('a').apply(f1)
    result2 = df2.groupby('a').apply(f1)
    assert_frame_equal(result1, result2)

    # should fail (not the same number of levels)
    for frame in (df, df2):
        grouped = frame.groupby('a')
        with pytest.raises(AssertionError):
            grouped.apply(f2)

    # should fail (incorrect shape)
    for frame in (df, df2):
        grouped = frame.groupby('a')
        with pytest.raises(AssertionError):
            grouped.apply(f3)
|
|
|
|
|
|
def test_attr_wrapper(ts):
    """Series methods/attributes are reachable through the GroupBy object."""
    grouped = ts.groupby(lambda x: x.weekday())

    result = grouped.std()
    expected = grouped.agg(lambda x: np.std(x, ddof=1))
    assert_series_equal(result, expected)

    # this is pretty cool
    result = grouped.describe()
    per_group = {name: gp.describe() for name, gp in grouped}
    expected = DataFrame(per_group).T
    assert_frame_equal(result, expected)

    # get attribute
    # NOTE(review): these two values are computed but never compared;
    # the statements only verify that neither call raises.
    result = grouped.dtype
    expected = grouped.agg(lambda x: x.dtype)

    # make sure raises error
    with pytest.raises(AttributeError):
        getattr(grouped, 'foo')
|
|
|
|
|
|
def test_frame_groupby(tsframe):
    """Exercise aggregate, transform, iteration and group lookup on a frame."""
    grouped = tsframe.groupby(lambda x: x.weekday())

    # aggregate
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == 5
    assert len(aggregated.columns) == 4

    # by string
    tscopy = tsframe.copy()
    tscopy['weekday'] = [stamp.weekday() for stamp in tscopy.index]
    stragged = tscopy.groupby('weekday').aggregate(np.mean)
    assert_frame_equal(stragged, aggregated, check_names=False)

    # transform
    grouped = tsframe.head(30).groupby(lambda x: x.weekday())
    transformed = grouped.transform(lambda x: x - x.mean())
    assert len(transformed) == 30
    assert len(transformed.columns) == 4

    # transform propagate
    transformed = grouped.transform(lambda x: x.mean())
    for _, group in grouped:
        group_mean = group.mean()
        # every row of the group must carry the group's mean
        for label in group.index:
            tm.assert_series_equal(transformed.xs(label), group_mean,
                                   check_names=False)

    # iterate
    for weekday, group in grouped:
        assert group.index[0].weekday() == weekday

    # groups / group_indices
    groups = grouped.groups
    indices = grouped.indices

    for key, labels in compat.iteritems(groups):
        samething = tsframe.index.take(indices[key])
        assert (samething == labels).all()
|
|
|
|
|
|
def test_frame_groupby_columns(tsframe):
    """Group the columns of a frame through a dict mapping (``axis=1``)."""
    mapping = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
    grouped = tsframe.groupby(mapping, axis=1)

    # aggregate
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == len(tsframe)
    assert len(aggregated.columns) == 2

    # transform
    tf = lambda x: x - x.mean()
    groupedT = tsframe.T.groupby(mapping, axis=0)
    via_transpose = groupedT.transform(tf).T
    assert_frame_equal(via_transpose, grouped.transform(tf))

    # iterate
    for _, piece in grouped:
        assert len(piece.columns) == 2
|
|
|
|
|
|
def test_frame_set_name_single(df):
    """The result index is named after the single grouping column."""
    grouped = df.groupby('A')

    result = grouped.mean()
    assert result.index.name == 'A'

    # with as_index=False the key is not used as the index
    result = df.groupby('A', as_index=False).mean()
    assert result.index.name != 'A'

    result = grouped.agg(np.mean)
    assert result.index.name == 'A'

    result = grouped.agg({'C': np.mean, 'D': np.std})
    assert result.index.name == 'A'

    column_c = grouped['C']

    result = column_c.mean()
    assert result.index.name == 'A'
    result = column_c.agg(np.mean)
    assert result.index.name == 'A'
    result = column_c.agg([np.mean, np.std])
    assert result.index.name == 'A'

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result = grouped['C'].agg({'foo': np.mean, 'bar': np.std})
    assert result.index.name == 'A'
|
|
|
|
|
|
def test_multi_func(df):
    """Grouping by two functions matches grouping by two column names."""
    col1 = df['A']
    col2 = df['B']

    agged = df.groupby([col1.get, col2.get]).mean()
    expected = df.groupby(['A', 'B']).mean()

    # TODO groupby get drops names
    value_columns = ['C', 'D']
    assert_frame_equal(agged.loc[:, value_columns],
                       expected.loc[:, value_columns],
                       check_names=False)

    # some "groups" with no data
    df = DataFrame({'v1': np.random.randn(6),
                    'v2': np.random.randn(6),
                    'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                    'k2': np.array(['1', '1', '1', '2', '2', '2'])},
                   index=['one', 'two', 'three', 'four', 'five', 'six'])
    # only verify that it works for now
    grouped = df.groupby(['k1', 'k2'])
    grouped.agg(np.sum)
|
|
|
|
|
|
def test_multi_key_multiple_functions(df):
    """A list of functions yields one column per function."""
    grouped = df.groupby(['A', 'B'])['C']

    agged = grouped.agg([np.mean, np.std])

    # build the same frame one function at a time
    means = grouped.agg(np.mean)
    stds = grouped.agg(np.std)
    expected = DataFrame({'mean': means, 'std': stds})
    assert_frame_equal(agged, expected)
|
|
|
|
|
|
def test_frame_multi_key_function_list():
    """Frame-wide list aggregation equals per-column results concatenated."""
    data = DataFrame(
        {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar',
               'foo', 'foo', 'foo'],
         'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two',
               'two', 'two', 'one'],
         'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny',
               'dull', 'shiny', 'shiny', 'shiny'],
         'D': np.random.randn(11),
         'E': np.random.randn(11),
         'F': np.random.randn(11)})

    grouped = data.groupby(['A', 'B'])
    funcs = [np.mean, np.std]
    agged = grouped.agg(funcs)

    numeric_columns = ['D', 'E', 'F']
    pieces = [grouped[col].agg(funcs) for col in numeric_columns]
    expected = pd.concat(pieces, keys=numeric_columns, axis=1)

    assert (isinstance(agged.index, MultiIndex))
    assert (isinstance(expected.index, MultiIndex))
    assert_frame_equal(agged, expected)
|
|
|
|
|
|
@pytest.mark.parametrize('op', [lambda x: x.sum(), lambda x: x.mean()])
def test_groupby_multiple_columns(df, op):
    """Two-key reductions agree with nested single-key reductions."""
    data = df
    grouped = data.groupby(['A', 'B'])

    # NOTE(review): the original indentation was lost; the Panel-based
    # comparison is kept under catch_warnings on the assumption that the
    # guard exists for it -- confirm against upstream.
    with catch_warnings(record=True):
        result1 = op(grouped)

        nested = defaultdict(dict)
        for n1, gp1 in data.groupby('A'):
            for n2, gp2 in gp1.groupby('B'):
                nested[n1][n2] = op(gp2.loc[:, ['C', 'D']])
        frames = {k: DataFrame(v) for k, v in compat.iteritems(nested)}
        expected = Panel.fromDict(frames).swapaxes(0, 1)
        expected.major_axis.name, expected.minor_axis.name = 'A', 'B'

        # a little bit crude
        for col in ['C', 'D']:
            result_col = op(grouped[col])
            exp = expected[col]
            pivoted = result1[col].unstack()
            pivoted2 = result_col.unstack()
            assert_frame_equal(pivoted.reindex_like(exp), exp)
            assert_frame_equal(pivoted2.reindex_like(exp), exp)

    # test single series works the same
    result = data['C'].groupby([data['A'], data['B']]).mean()
    expected = data.groupby(['A', 'B']).mean()['C']

    assert_series_equal(result, expected)
|
|
|
|
|
|
def test_groupby_as_index_agg(df):
    """``agg`` with ``as_index=False`` matches the built-in reductions."""
    grouped = df.groupby('A', as_index=False)

    # single-key

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    assert_frame_equal(result, expected)

    result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
    expected2 = grouped.mean()
    expected2['D'] = grouped.sum()['D']
    assert_frame_equal(result2, expected2)

    grouped = df.groupby('A', as_index=True)
    expected3 = grouped['C'].sum()
    expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})

    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        result3 = grouped['C'].agg({'Q': np.sum})
    assert_frame_equal(result3, expected3)

    # multi-key

    grouped = df.groupby(['A', 'B'], as_index=False)

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    assert_frame_equal(result, expected)

    result2 = grouped.agg(OrderedDict([['C', np.mean], ['D', np.sum]]))
    expected2 = grouped.mean()
    expected2['D'] = grouped.sum()['D']
    assert_frame_equal(result2, expected2)

    expected3 = grouped['C'].sum()
    expected3 = DataFrame(expected3).rename(columns={'C': 'Q'})
    result3 = grouped['C'].agg({'Q': np.sum})
    assert_frame_equal(result3, expected3)

    # GH7115 & GH8112 & GH8582
    df = DataFrame(np.random.randint(0, 100, (50, 3)),
                   columns=['jim', 'joe', 'jolie'])
    ts = Series(np.random.randint(5, 10, 50), name='jim')

    gr = df.groupby(ts)
    gr.nth(0)  # invokes set_selection_from_grouper internally
    assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))

    # as_index=False must equal the indexed result with its index dropped
    for attr in ['mean', 'max', 'count', 'idxmax', 'cumsum', 'all']:
        gr = df.groupby(ts, as_index=False)
        left = getattr(gr, attr)()

        gr = df.groupby(ts.values, as_index=True)
        right = getattr(gr, attr)().reset_index(drop=True)

        assert_frame_equal(left, right)
|
|
|
|
|
|
def test_as_index_series_return_frame(df):
    """Selecting one column with ``as_index=False`` still yields a frame."""
    grouped = df.groupby('A', as_index=False)
    grouped2 = df.groupby(['A', 'B'], as_index=False)

    # via agg
    result = grouped['C'].agg(np.sum)
    expected = grouped.agg(np.sum).loc[:, ['A', 'C']]
    assert isinstance(result, DataFrame)
    assert_frame_equal(result, expected)

    result2 = grouped2['C'].agg(np.sum)
    expected2 = grouped2.agg(np.sum).loc[:, ['A', 'B', 'C']]
    assert isinstance(result2, DataFrame)
    assert_frame_equal(result2, expected2)

    # via the built-in reduction
    result = grouped['C'].sum()
    expected = grouped.sum().loc[:, ['A', 'C']]
    assert isinstance(result, DataFrame)
    assert_frame_equal(result, expected)

    result2 = grouped2['C'].sum()
    expected2 = grouped2.sum().loc[:, ['A', 'B', 'C']]
    assert isinstance(result2, DataFrame)
    assert_frame_equal(result2, expected2)

    # corner case
    selected = grouped['C']
    with pytest.raises(Exception):
        selected['D']
|
|
|
|
|
|
def test_groupby_as_index_cython(df):
    """``mean`` with ``as_index=False`` keeps the keys as leading columns."""
    data = df

    # single-key
    result = data.groupby('A', as_index=False).mean()
    expected = data.groupby(['A']).mean()
    expected.insert(0, 'A', expected.index)
    expected.index = np.arange(len(expected))
    assert_frame_equal(result, expected)

    # multi-key
    result = data.groupby(['A', 'B'], as_index=False).mean()
    expected = data.groupby(['A', 'B']).mean()

    # split the index tuples back into one sequence per key
    key_arrays = lzip(*expected.index.values)
    expected.insert(0, 'A', key_arrays[0])
    expected.insert(1, 'B', key_arrays[1])
    expected.index = np.arange(len(expected))
    assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_groupby_as_index_series_scalar(df):
    """Scalar-returning ``agg`` on one column with ``as_index=False``."""
    grouped = df.groupby(['A', 'B'], as_index=False)

    # GH #421

    expected = grouped.agg(len).loc[:, ['A', 'B', 'C']]
    result = grouped['C'].agg(len)
    assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_groupby_as_index_corner(df, ts):
    """``as_index=False`` raises for a Series and for ``axis=1``."""
    with pytest.raises(TypeError):
        ts.groupby(lambda x: x.weekday(), as_index=False)

    with pytest.raises(ValueError):
        df.groupby(lambda x: x.lower(), as_index=False, axis=1)
|
|
|
|
|
|
def test_groupby_multiple_key(df):
    """Grouping by (year, month, day) leaves one row per original row."""
    # the fixture argument is replaced by a fresh time-indexed frame
    df = tm.makeTimeDataFrame()
    grouped = df.groupby([lambda x: x.year, lambda x: x.month,
                          lambda x: x.day])
    agged = grouped.sum()
    assert_almost_equal(df.values, agged.values)

    # same grouping applied to the columns of the transposed frame
    transposed = df.T
    grouped = transposed.groupby([lambda x: x.year,
                                  lambda x: x.month,
                                  lambda x: x.day], axis=1)

    agged = grouped.agg(lambda x: x.sum())
    tm.assert_index_equal(agged.index, df.columns)
    assert_almost_equal(df.T.values, agged.values)

    agged = grouped.agg(lambda x: x.sum())
    assert_almost_equal(df.T.values, agged.values)
|
|
|
|
|
|
def test_groupby_multi_corner(df):
    """An all-NA column must not disturb a multi-key ``mean``."""
    # test that having an all-NA column doesn't mess you up
    df = df.copy()
    df['bad'] = np.nan
    keys = ['A', 'B']
    agged = df.groupby(keys).mean()

    expected = df.groupby(keys).mean()
    expected['bad'] = np.nan

    assert_frame_equal(agged, expected)
|
|
|
|
|
|
def test_omit_nuisance(df):
    """Compare reductions on a frame with and without non-numeric columns."""
    grouped = df.groupby('A')

    # mean over the whole frame equals mean over the A/C/D subset
    result = grouped.mean()
    expected = df.loc[:, ['A', 'C', 'D']].groupby('A').mean()
    assert_frame_equal(result, expected)

    agged = grouped.agg(np.mean)
    exp = grouped.mean()
    assert_frame_equal(agged, exp)

    # add a datetime column and compare agg(np.sum) with sum()
    df = df.loc[:, ['A', 'C', 'D']]
    df['E'] = datetime.now()
    grouped = df.groupby('A')
    result = grouped.agg(np.sum)
    expected = grouped.sum()
    assert_frame_equal(result, expected)

    # won't work with axis = 1
    grouped = df.groupby({'A': 0, 'C': 0, 'D': 1, 'E': 1}, axis=1)
    with pytest.raises(TypeError):
        grouped.agg(lambda x: x.sum(0, numeric_only=False))
|
|
|
|
|
|
def test_omit_nuisance_python_multiple(three_group):
    """``agg(np.mean)`` and ``mean()`` agree on a two-key grouping."""
    grouped = three_group.groupby(['A', 'B'])

    exp = grouped.mean()
    agged = grouped.agg(np.mean)
    assert_frame_equal(agged, exp)
|
|
|
|
|
|
def test_empty_groups_corner(mframe):
    """Key combinations that never occur must not break aggregation."""
    # handle empty groups
    df = DataFrame({'k1': np.array(['b', 'b', 'b', 'a', 'a', 'a']),
                    'k2': np.array(['1', '1', '1', '2', '2', '2']),
                    'k3': ['foo', 'bar'] * 3,
                    'v1': np.random.randn(6),
                    'v2': np.random.randn(6)})

    grouped = df.groupby(['k1', 'k2'])
    result = grouped.agg(np.mean)
    expected = grouped.mean()
    assert_frame_equal(result, expected)

    # a two-row slice of the MultiIndexed frame, grouped on its first level
    grouped = mframe[3:5].groupby(level=0)
    agged = grouped.apply(lambda x: x.mean())
    agged_A = grouped['A'].apply(np.mean)
    assert_series_equal(agged['A'], agged_A)
    assert agged.index.name == 'first'
|
|
|
|
|
|
def test_nonsense_func():
    """A key function that fails on the index labels makes groupby raise."""
    df = DataFrame([0])
    with pytest.raises(Exception):
        df.groupby(lambda x: x + 'foo')
|
|
|
|
|
|
def test_wrap_aggregated_output_multindex(mframe):
    """Aggregated output keeps MultiIndex columns; failing columns drop out."""
    df = mframe.T
    df['baz', 'two'] = 'peekaboo'

    keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
    agged = df.groupby(keys).agg(np.mean)
    assert isinstance(agged.columns, MultiIndex)

    def aggfun(ser):
        # refuse exactly one column so that it is left out of the result
        if ser.name == ('foo', 'one'):
            raise TypeError
        return ser.sum()

    agged2 = df.groupby(keys).aggregate(aggfun)
    assert len(agged2.columns) + 1 == len(df.columns)
|
|
|
|
|
|
def test_groupby_level_apply(mframe):
    """Grouping by level names the result index after that level."""
    frame_by_first = mframe.groupby(level=0).count()
    assert frame_by_first.index.name == 'first'

    frame_by_second = mframe.groupby(level=1).count()
    assert frame_by_second.index.name == 'second'

    series_by_first = mframe['A'].groupby(level=0).count()
    assert series_by_first.index.name == 'first'
|
|
|
|
|
|
def test_groupby_level_mapper(mframe):
    """A dict mapper combined with ``level`` maps that level's labels."""
    deleveled = mframe.reset_index()

    mapper0 = {'foo': 0, 'bar': 0, 'baz': 1, 'qux': 1}
    mapper1 = {'one': 0, 'two': 0, 'three': 1}

    result0 = mframe.groupby(mapper0, level=0).sum()
    result1 = mframe.groupby(mapper1, level=1).sum()

    # apply the mappers by hand and group on the resulting arrays
    mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']])
    mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']])
    expected0 = mframe.groupby(mapped_level0).sum()
    expected1 = mframe.groupby(mapped_level1).sum()
    expected0.index.name = 'first'
    expected1.index.name = 'second'

    assert_frame_equal(result0, expected0)
    assert_frame_equal(result1, expected1)
|
|
|
|
|
|
def test_groupby_level_nonmulti():
    """``level`` on a flat index accepts only 0/-1 or their 1-item lists."""
    # GH 1313, GH 13901
    s = Series([1, 2, 3, 10, 4, 5, 20, 6],
               Index([1, 2, 3, 1, 4, 5, 2, 6], name='foo'))
    expected = Series([11, 22, 3, 4, 5, 6],
                      Index(range(1, 7), name='foo'))

    for good_level in (0, [0], -1, [-1]):
        result = s.groupby(level=good_level).sum()
        tm.assert_series_equal(result, expected)

    for bad_level in (1, -2, [], [0, 0], [0, 1], [1]):
        with pytest.raises(ValueError):
            s.groupby(level=bad_level)
|
|
|
|
|
|
def test_groupby_complex():
    """Summing complex values by index level."""
    # GH 12902
    a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
    expected = Series((1 + 2j, 5 + 10j))

    via_groupby = a.groupby(level=0).sum()
    assert_series_equal(via_groupby, expected)

    via_sum_level = a.sum(level=0)
    assert_series_equal(via_sum_level, expected)
|
|
|
|
|
|
def test_mutate_groups():
    """``apply`` gives the same result whether or not the group is mutated."""
    # GH3380

    df = DataFrame({
        'cat1': ['a'] * 8 + ['b'] * 6,
        'cat2': ['c'] * 2 + ['d'] * 2 + ['e'] * 2 + ['f'] * 2 + ['c'] * 2 +
        ['d'] * 2 + ['e'] * 2,
        'cat3': lmap(lambda x: 'g%s' % x, lrange(1, 15)),
        'val': np.random.randint(100, size=14),
    })

    def f_copy(x):
        # works on a private copy of the group
        x = x.copy()
        x['rank'] = x.val.rank(method='min')
        return x.groupby('cat2')['rank'].min()

    def f_no_copy(x):
        # writes the new column into the group it was handed
        x['rank'] = x.val.rank(method='min')
        return x.groupby('cat2')['rank'].min()

    grpby_copy = df.groupby('cat1').apply(f_copy)
    grpby_no_copy = df.groupby('cat1').apply(f_no_copy)
    assert_series_equal(grpby_copy, grpby_no_copy)
|
|
|
|
|
|
def test_no_mutate_but_looks_like():
    """Slicing the group before reading a column must not change the result."""
    # GH 8467
    # first show's mutation indicator
    # second does not, but should yield the same results
    df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3], 'value': range(9)})

    sliced = df.groupby('key', group_keys=True).apply(lambda x: x[:].key)
    direct = df.groupby('key', group_keys=True).apply(lambda x: x.key)
    assert_series_equal(sliced, direct)
|
|
|
|
|
|
def test_groupby_series_indexed_differently():
    """Group a Series by another Series whose index only partly overlaps."""
    s1 = Series([5.0, -9.0, 4.0, 100., -5., 55., 6.7],
                index=Index(['a', 'b', 'c', 'd', 'e', 'f', 'g']))
    s2 = Series([1.0, 1.0, 4.0, 5.0, 5.0, 7.0],
                index=Index(['a', 'b', 'd', 'f', 'g', 'h']))

    agged = s1.groupby(s2).mean()

    # reference: reindex the keys to s1's index and look them up per label
    aligned_keys = s2.reindex(s1.index)
    exp = s1.groupby(aligned_keys.get).mean()
    assert_series_equal(agged, exp)
|
|
|
|
|
|
def test_groupby_with_hier_columns():
    """Hierarchical columns survive grouping along either axis."""
    tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux',
                         'qux'], ['one', 'two', 'one', 'two', 'one', 'two',
                                  'one', 'two']]))
    index = MultiIndex.from_tuples(tuples)
    columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), (
        'B', 'cat'), ('A', 'dog')])
    df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)

    # row-wise grouping keeps the column MultiIndex untouched
    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, columns)

    # column-wise grouping keeps the row index untouched
    result = df.groupby(level=0, axis=1).mean()
    tm.assert_index_equal(result.index, df.index)

    result = df.groupby(level=0).agg(np.mean)
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0).apply(lambda x: x.mean())
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
    tm.assert_index_equal(result.columns, Index(['A', 'B']))
    tm.assert_index_equal(result.index, df.index)

    # add a nuisance column
    sorted_columns, _ = columns.sortlevel(0)
    df['A', 'foo'] = 'bar'
    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, df.columns[:-1])
|
|
|
|
|
|
def test_grouping_ndarray(df):
    """Grouping by a column's raw values matches grouping by its name."""
    result = df.groupby(df['A'].values).sum()
    expected = df.groupby('A').sum()

    # Note: no names when grouping by value
    assert_frame_equal(result, expected, check_names=False)
|
|
|
|
|
|
def test_groupby_wrong_multi_labels():
    """``agg(np.mean)`` equals ``mean()`` for a four-key grouping."""
    data = """index,foo,bar,baz,spam,data
0,foo1,bar1,baz1,spam2,20
1,foo1,bar2,baz1,spam3,30
2,foo2,bar2,baz1,spam2,40
3,foo1,bar1,baz2,spam1,50
4,foo3,bar1,baz2,spam1,60"""

    data = read_csv(StringIO(data), index_col=0)

    grouped = data.groupby(['foo', 'bar', 'baz', 'spam'])

    expected = grouped.mean()
    result = grouped.agg(np.mean)
    assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_groupby_series_with_name(df):
    """Names of key Series carry over to the result index or columns."""
    key_a = df['A']
    result = df.groupby(key_a).mean()
    result2 = df.groupby(key_a, as_index=False).mean()
    assert result.index.name == 'A'
    assert 'A' in result2

    result = df.groupby([df['A'], df['B']]).mean()
    result2 = df.groupby([df['A'], df['B']],
                         as_index=False).mean()
    assert result.index.names == ('A', 'B')
    assert 'A' in result2
    assert 'B' in result2
|
|
|
|
|
|
def test_seriesgroupby_name_attr(df):
    """Reductions of a selected column keep that column's name."""
    # GH 6265
    result = df.groupby('A')['C']
    assert result.count().name == 'C'
    assert result.mean().name == 'C'

    testFunc = lambda x: np.sum(x) * 2
    aggregated = result.agg(testFunc)
    assert aggregated.name == 'C'
|
|
|
|
|
|
def test_consistency_name():
    """Frame-then-select and select-then-group counts must be equal."""
    # GH 12363

    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': np.random.randn(8) + 1.0,
                    'D': np.arange(8)})

    via_frame = df.groupby(['A']).B.count()
    via_series = df.B.groupby(df.A).count()
    assert_series_equal(via_series, via_frame)
|
|
|
|
|
|
def test_groupby_name_propagation(df):
    """The name of the Series returned by ``apply`` names the columns."""
    # GH 6124
    def summarize(df, name=None):
        return Series({'count': 1, 'mean': 2, 'omissions': 3, }, name=name)

    def summarize_random_name(df):
        # Provide a different name for each Series.  In this case, groupby
        # should not attempt to propagate the Series name since they are
        # inconsistent.
        return Series({
            'count': 1,
            'mean': 2,
            'omissions': 3,
        }, name=df.iloc[0]['A'])

    grouped = df.groupby('A')

    metrics = grouped.apply(summarize)
    assert metrics.columns.name is None

    metrics = grouped.apply(summarize, 'metrics')
    assert metrics.columns.name == 'metrics'

    metrics = grouped.apply(summarize_random_name)
    assert metrics.columns.name is None
|
|
|
|
|
|
def test_groupby_nonstring_columns():
    """An integer column label works as a grouping key."""
    df = DataFrame([np.arange(10) for _ in range(10)])

    by_label = df.groupby(0).mean()
    by_series = df.groupby(df[0]).mean()
    assert_frame_equal(by_label, by_series)
|
|
|
|
|
|
def test_groupby_mixed_type_columns():
    """Column labels mixing strings and integers must not break grouping."""
    # GH 13432, unorderable types in py3
    df = DataFrame([[0, 1, 2]], columns=['A', 'B', 0])
    expected = DataFrame([[1, 2]], columns=['B', 0],
                         index=Index([0], name='A'))
    grouped = df.groupby('A')

    tm.assert_frame_equal(grouped.first(), expected)

    tm.assert_frame_equal(grouped.sum(), expected)
|
|
|
|
|
|
def test_cython_grouper_series_bug_noncontig():
    """Aggregate a Series backed by one column of a 2-D all-NaN array."""
    arr = np.empty((100, 100))
    arr.fill(np.nan)

    # first column of the 2-D array
    first_column = arr[:, 0]
    obj = Series(first_column, index=lrange(100))
    inds = np.tile(lrange(10), 10)

    result = obj.groupby(inds).agg(Series.median)
    assert result.isna().all()
|
|
|
|
|
|
def test_series_grouper_noncontig_index():
    """Touching index elements inside ``agg`` must not crash."""
    full_index = Index(tm.rands_array(10, 100))

    # Only every other label is used, so the Series index is a slice of
    # ``full_index``.
    values = Series(np.random.randn(50), index=full_index[::2])
    labels = np.random.randint(0, 5, 50)

    # it works!
    grouped = values.groupby(labels)

    # accessing the index elements causes segfault
    def count_distinct_ids(x):
        return len(set(map(id, x.index)))

    grouped.agg(count_distinct_ids)
|
|
|
|
|
|
def test_convert_objects_leave_decimal_alone():
    """Decimal values returned from ``agg`` stay Decimal in an object result.

    Two aggregators are checked: a plain one and one that asserts on
    ``x.values.base`` before computing the same value.
    """
    s = Series(list(range(5)))
    labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O')

    def convert_fast(x):
        return Decimal(str(x.mean()))

    def convert_force_pure(x):
        # base will be length 0
        # NOTE(review): presumably this assertion makes a fast path bail
        # out so the pure-Python path is exercised -- confirm.
        assert (len(x.values.base) > 0)
        return Decimal(str(x.mean()))

    grouped = s.groupby(labels)

    for func in (convert_fast, convert_force_pure):
        result = grouped.agg(func)
        assert result.dtype == np.object_
        # The result is labelled 'a'..'e'; select the first element by
        # position explicitly instead of relying on ``result[0]`` falling
        # back to positional lookup on a non-integer index.
        assert isinstance(result.iloc[0], Decimal)
|
|
|
|
|
|
def test_groupby_dtype_inference_empty():
    """Column and index dtypes survive ``first`` on an empty frame."""
    # GH 6733
    empty = DataFrame({'x': [], 'range': np.arange(0, dtype='int64')})
    assert empty['x'].dtype == np.float64

    result = empty.groupby('x').first()

    exp_index = Index([], name='x', dtype=np.float64)
    exp_column = Series([], index=exp_index, dtype='int64')
    expected = DataFrame({'range': exp_column})

    assert_frame_equal(result, expected, by_blocks=True)
|
|
|
|
|
|
def test_groupby_list_infer_array_like(df):
    """A plain list of key values groups like the equivalent Series key.

    Also checks that a key list one element shorter than the frame is
    rejected, and that grouping by column names agrees with grouping by
    the columns themselves.
    """
    result = df.groupby(list(df['A'])).mean()
    expected = df.groupby(df['A']).mean()
    assert_frame_equal(result, expected, check_names=False)

    # Build the short key list first so only ``groupby`` itself is
    # expected to raise.
    short_keys = list(df['A'][:-1])
    with pytest.raises(Exception):
        df.groupby(short_keys)

    # pathological case of ambiguity
    df = DataFrame({'foo': [0, 1],
                    'bar': [3, 4],
                    'val': np.random.randn(2)})

    result = df.groupby(['foo', 'bar']).mean()
    expected = df.groupby([df['foo'], df['bar']]).mean()[['val']]
    # These two were previously computed but never compared.
    assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_groupby_keys_same_size_as_index():
    """Group by an index-level Grouper plus a column, one row per group."""
    # GH 11185
    freq = 's'
    start = pd.Timestamp('2015-09-29T11:34:44-0700')
    index = pd.date_range(start=start, periods=2, freq=freq)
    df = pd.DataFrame([['A', 10], ['B', 15]],
                      columns=['metric', 'values'], index=index)

    keys = [pd.Grouper(level=0, freq=freq), 'metric']
    result = df.groupby(keys).mean()

    expected = df.set_index([df.index, 'metric'])
    assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_groupby_one_row():
    """Grouping by a missing column raises KeyError for 1- and 2-row data."""
    # GH 11741
    columns = list('ABCD')

    single_row = pd.DataFrame(np.random.randn(1, 4), columns=columns)
    with pytest.raises(KeyError):
        single_row.groupby('Z')

    two_rows = pd.DataFrame(np.random.randn(2, 4), columns=columns)
    with pytest.raises(KeyError):
        two_rows.groupby('Z')
|
|
|
|
|
|
def test_groupby_nat_exclude():
    """Rows whose key is NaN/NaT are left out of every group (GH 6992)."""
    jan = pd.Timestamp('2013-01-01')
    feb = pd.Timestamp('2013-02-01')
    df = pd.DataFrame(
        {'values': np.random.randn(8),
         'dt': [np.nan, jan, np.nan, feb, np.nan, feb, np.nan, jan],
         'str': [np.nan, 'a', np.nan, 'a', np.nan, 'a', np.nan, 'b']})
    grouped = df.groupby('dt')

    expected = [pd.Index([1, 7]), pd.Index([3, 5])]
    keys = sorted(grouped.groups.keys())
    assert len(keys) == 2
    for key, exp_labels in zip(keys, expected):
        # grouped.groups keys are np.datetime64 with system tz
        # not to be affected by tz, only compare values
        tm.assert_index_equal(grouped.groups[key], exp_labels)

    # confirm obj is not filtered
    tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
    assert grouped.ngroups == 2

    expected = {
        Timestamp('2013-01-01 00:00:00'): np.array([1, 7], dtype=np.int64),
        Timestamp('2013-02-01 00:00:00'): np.array([3, 5], dtype=np.int64)
    }
    for key, positions in grouped.indices.items():
        tm.assert_numpy_array_equal(positions, expected[key])

    tm.assert_frame_equal(grouped.get_group(jan), df.iloc[[1, 7]])
    tm.assert_frame_equal(grouped.get_group(feb), df.iloc[[3, 5]])

    with pytest.raises(KeyError):
        grouped.get_group(pd.NaT)

    # Frames whose key columns hold nothing but missing values.
    nan_df = DataFrame({'nan': [np.nan, np.nan, np.nan],
                        'nat': [pd.NaT, pd.NaT, pd.NaT]})
    assert nan_df['nan'].dtype == 'float64'
    assert nan_df['nat'].dtype == 'datetime64[ns]'

    for key in ['nan', 'nat']:
        grouped = nan_df.groupby(key)
        assert grouped.groups == {}
        assert grouped.ngroups == 0
        assert grouped.indices == {}
        for missing in (np.nan, pd.NaT):
            with pytest.raises(KeyError):
                grouped.get_group(missing)
|
|
|
|
|
|
def test_sparse_friendly(df):
    """Smoke test: groupby operations run on sparse containers."""
    sdf = df[['C', 'D']].to_sparse()
    with catch_warnings(record=True):
        # Only referenced by the disabled check at the bottom.
        panel = tm.makePanel()
        tm.add_nans(panel)

    def _check_work(gp):
        # Results are discarded; the calls just have to complete.
        gp.mean()
        gp.agg(np.mean)
        dict(iter(gp))

    def pair_up(x):
        return x // 2

    # it works!
    _check_work(sdf.groupby(pair_up))
    _check_work(sdf['C'].groupby(pair_up))
    _check_work(sdf.groupby(df['A']))

    # do this someday
    # _check_work(panel.groupby(lambda x: x.month, axis=1))
|
|
|
|
|
|
def test_panel_groupby():
    """Group a Panel along its items, major and minor axes."""
    with catch_warnings(record=True):
        panel = tm.makePanel()
        tm.add_nans(panel)

        # items axis, grouped through a mapping
        item_map = {'ItemA': 0, 'ItemB': 0, 'ItemC': 1}
        grouped = panel.groupby(item_map, axis='items')
        agged = grouped.mean()
        agged2 = grouped.agg(lambda x: x.mean('items'))

        tm.assert_panel_equal(agged, agged2)
        tm.assert_index_equal(agged.items, Index([0, 1]))

        # major axis, grouped by month
        grouped = panel.groupby(lambda x: x.month, axis='major')
        agged = grouped.mean()

        exp = Index(sorted(set(panel.major_axis.month)))
        tm.assert_index_equal(agged.major_axis, exp)

        # minor axis, grouped through a mapping
        minor_map = {'A': 0, 'B': 0, 'C': 1, 'D': 1}
        grouped = panel.groupby(minor_map, axis='minor')
        agged = grouped.mean()
        tm.assert_index_equal(agged.minor_axis, Index([0, 1]))
|
|
|
|
|
|
def test_groupby_2d_malformed():
    """``mean`` on a frame built column by column keeps numeric columns."""
    d = DataFrame(index=list(range(2)))
    # Columns are attached one at a time, in this order.
    for label, column in [('group', ['g1', 'g2']),
                          ('zeros', [0, 0]),
                          ('ones', [1, 1]),
                          ('label', ['l1', 'l2'])]:
        d[label] = column

    tmp = d.groupby(['group']).mean()

    tm.assert_index_equal(tmp.columns, Index(['zeros', 'ones']))
    res_values = np.array([[0, 1], [0, 1]], dtype=np.int64)
    tm.assert_numpy_array_equal(tmp.values, res_values)
|
|
|
|
|
|
def test_int32_overflow():
    """Key order must not change the number of groups that are found."""
    wide = np.arange(25000)
    repeated = np.concatenate(
        (np.arange(10000), np.arange(10000), np.arange(5000)))

    df = DataFrame({'A': wide,
                    'B': repeated,
                    'C': wide,
                    'D': repeated,
                    'E': np.random.randn(25000)})

    forward = df.groupby(['A', 'B', 'C', 'D']).sum()
    backward = df.groupby(['D', 'C', 'B', 'A']).sum()
    assert len(forward) == len(backward)
|
|
|
|
|
|
def test_groupby_sort_multi():
    """Sorted multi-key groupby orders its groups by the key tuples."""
    df = DataFrame({'a': ['foo', 'bar', 'baz'],
                    'b': [3, 2, 1],
                    'c': [0, 1, 2],
                    'd': np.random.randn(3)})

    # (grouping keys, row positions listed in the expected group order)
    cases = [(['a', 'b', 'c'], [1, 2, 0]),
             (['c', 'a', 'b'], [0, 1, 2]),
             (['b', 'c', 'a'], [2, 1, 0])]
    for keys, order in cases:
        tups = com._asarray_tuplesafe(lmap(tuple, df[keys].values))
        result = df.groupby(keys, sort=True).sum()
        tm.assert_numpy_array_equal(result.index.values, tups[order])

    df = DataFrame({'a': [0, 1, 2, 0, 1, 2],
                    'b': [0, 0, 0, 1, 1, 1],
                    'd': np.random.randn(6)})
    result = df.groupby(['a', 'b'])['d'].sum()

    def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
        # Group by the materialised key tuples and compare entry by entry.
        tups = com._asarray_tuplesafe(lmap(tuple, df[keys].values))
        expected = f(df.groupby(tups)[field])
        for k, v in compat.iteritems(expected):
            assert (result[k] == v)

    _check_groupby(df, result, ['a', 'b'], 'd')
|
|
|
|
|
|
def test_dont_clobber_name_column():
    """An identity ``apply`` must leave a column called 'name' intact."""
    keys = ['a'] * 3 + ['b'] * 3
    df = DataFrame({'key': keys,
                    'name': ['foo', 'bar', 'baz'] * 2})

    def identity(x):
        return x

    result = df.groupby('key').apply(identity)
    assert_frame_equal(result, df)
|
|
|
|
|
|
def test_skip_group_keys():
    """``group_keys=False`` gives the plain concatenation of the pieces."""
    tsf = tm.makeTimeDataFrame()

    def by_month(x):
        return x.month

    # DataFrame case: three smallest 'A' rows per month.
    grouped = tsf.groupby(by_month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values(by='A')[:3])

    pieces = [group.sort_values(by='A')[:3] for _, group in grouped]
    expected = pd.concat(pieces)
    assert_frame_equal(result, expected)

    # Series case: three smallest values per month.
    grouped = tsf['A'].groupby(by_month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values()[:3])

    pieces = [group.sort_values()[:3] for _, group in grouped]
    expected = pd.concat(pieces)
    assert_series_equal(result, expected)
|
|
|
|
|
|
def test_no_nonsense_name(frame):
    """An unnamed Series stays unnamed after a groupby aggregation."""
    # GH #995
    unnamed = frame['C'].copy()
    unnamed.name = None

    summed = unnamed.groupby(frame['A']).agg(np.sum)
    assert summed.name is None
|
|
|
|
|
|
def test_multifunc_sum_bug():
    """Dict ``agg`` mixing 'sum' and 'size' keeps the float dtype."""
    # GH #1065
    x = DataFrame(np.arange(9).reshape(3, 3))
    x['test'] = 0
    x['fl'] = [1.3, 1.5, 1.6]

    spec = {'fl': 'sum', 2: 'size'}
    result = x.groupby('test').agg(spec)

    assert result['fl'].dtype == np.float64
|
|
|
|
|
|
def test_handle_dict_return_value(df):
    """A dict returned from ``apply`` is handled like the same Series."""
    def as_dict(group):
        return {'max': group.max(), 'min': group.min()}

    def as_series(group):
        return Series({'max': group.max(), 'min': group.min()})

    from_dict = df.groupby('A')['C'].apply(as_dict)
    from_series = df.groupby('A')['C'].apply(as_series)

    assert isinstance(from_dict, Series)
    assert_series_equal(from_dict, from_series)
|
|
|
|
|
|
@pytest.mark.parametrize('grouper', ['A', ['A', 'B']])
def test_set_group_name(df, grouper):
    """Every object handed to a user function has its ``name`` set."""
    def passthrough(group):
        assert group.name is not None
        return group

    def freduce(group):
        assert group.name is not None
        return group.sum()

    def foo(x):
        # Second, differently named reducer for the list-of-functions case.
        return freduce(x)

    grouped = df.groupby(grouper)

    # make sure all these work
    # ... on the whole frame
    grouped.apply(passthrough)
    grouped.aggregate(freduce)
    grouped.aggregate({'C': freduce, 'D': freduce})
    grouped.transform(passthrough)

    # ... and on a single selected column
    grouped['C'].apply(passthrough)
    grouped['C'].aggregate(freduce)
    grouped['C'].aggregate([freduce, foo])
    grouped['C'].transform(passthrough)
|
|
|
|
|
|
def test_group_name_available_in_inference_pass():
    """``group.name`` is readable on every call ``apply`` makes."""
    # gh-15062
    df = pd.DataFrame({'a': [0, 0, 1, 1, 2, 2], 'b': np.arange(6)})

    seen = []

    def record_name(group):
        seen.append(group.name)
        return group.copy()

    df.groupby('a', sort=False, group_keys=False).apply(record_name)

    # we expect 2 zeros because we call ``f`` once to see if a faster route
    # can be used.
    assert seen == [0, 0, 1, 2]
|
|
|
|
|
|
def test_no_dummy_key_names(df):
    """Grouping by bare arrays leaves the result index unnamed."""
    # see gh-1291
    a_values = df['A'].values
    single = df.groupby(a_values).sum()
    assert single.index.name is None

    b_values = df['B'].values
    double = df.groupby([a_values, b_values]).sum()
    assert double.index.names == (None, None)
|
|
|
|
|
|
def test_groupby_sort_multiindex_series():
    """``sort`` is honoured when grouping a Series by index levels."""
    # series multiindex groupby sort argument was not being passed through
    # _compress_group_index
    # GH 9444
    names = ['a', 'b']

    index = MultiIndex(levels=[[1, 2], [1, 2]],
                       labels=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
                       names=names)
    mseries = Series([0, 1, 2, 3, 4, 5], index=index)

    index = MultiIndex(levels=[[1, 2], [1, 2]],
                       labels=[[0, 0, 1], [1, 0, 0]], names=names)
    mseries_result = Series([0, 2, 4], index=index)

    unsorted = mseries.groupby(level=['a', 'b'], sort=False).first()
    assert_series_equal(unsorted, mseries_result)

    in_order = mseries.groupby(level=['a', 'b'], sort=True).first()
    assert_series_equal(in_order, mseries_result.sort_index())
|
|
|
|
|
|
def test_groupby_reindex_inside_function():
    """Aggregators that subset their input by hour must agree.

    ``closure_bad`` and ``closure_good`` differ only in whether the
    aggregator also reads the first element of its input (``fix=True``);
    both are expected to produce the same frame.
    """
    periods = 1000
    ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods)
    df = DataFrame({'high': np.arange(periods),
                    'low': np.arange(periods)}, index=ind)

    def agg_before(hour, func, fix=False):
        """
        Run an aggregate func on the subset of data before ``hour``.
        """
        def _func(data):
            # Keep only the rows stamped earlier than ``hour`` (this used
            # to be hard-coded to 11, ignoring the argument).
            d = data.loc[data.index.map(
                lambda x: x.hour < hour)].dropna()
            if fix:
                # Deliberate bare lookup: the value is discarded, only the
                # access itself distinguishes the two closures.
                data[data.index[0]]
            if len(d) == 0:
                return None
            return func(d)

        return _func

    # One group per calendar day.
    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
    closure_bad = grouped.agg({'high': agg_before(11, np.max)})
    closure_good = grouped.agg({'high': agg_before(11, np.max, True)})

    assert_frame_equal(closure_bad, closure_good)
|
|
|
|
|
|
def test_groupby_multiindex_missing_pair():
    """Only level pairs present in the index appear in the result."""
    # GH9049
    raw = DataFrame({'group1': ['a', 'a', 'a', 'b'],
                     'group2': ['c', 'c', 'd', 'c'],
                     'value': [1, 1, 1, 5]})
    indexed = raw.set_index(['group1', 'group2'])

    by_levels = indexed.groupby(level=['group1', 'group2'], sort=True)
    res = by_levels.agg('sum')

    # ('b', 'd') never occurs in the data and is absent from the expectation.
    idx = MultiIndex.from_tuples(
        [('a', 'c'), ('a', 'd'), ('b', 'c')], names=['group1', 'group2'])
    exp = DataFrame([[2], [1], [5]], index=idx, columns=['value'])

    tm.assert_frame_equal(res, exp)
|
|
|
|
|
|
def test_groupby_multiindex_not_lexsorted():
    """Groupby gives the same answer on non-lexsorted MultiIndexes."""
    # GH 11640

    # define the lexsorted version
    lexsorted_mi = MultiIndex.from_tuples(
        [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
    lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
    assert lexsorted_df.columns.is_lexsorted()

    # define the non-lexsorted version
    long_form = DataFrame(columns=['a', 'b', 'c', 'd'],
                          data=[[1, 'b1', 'c1', 3],
                                [1, 'b2', 'c2', 4]])
    pivoted = long_form.pivot_table(
        index='a', columns=['b', 'c'], values='d')
    not_lexsorted_df = pivoted.reset_index()
    assert not not_lexsorted_df.columns.is_lexsorted()

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.groupby('a').mean()
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.groupby('a').mean()
    tm.assert_frame_equal(expected, result)

    # a transforming function should work regardless of sort
    # GH 14776
    df = DataFrame({'x': ['a', 'a', 'b', 'a'],
                    'y': [1, 1, 2, 2],
                    'z': [1, 2, 3, 4]}).set_index(['x', 'y'])
    assert not df.index.is_lexsorted()

    for level in [0, 1, [0, 1]]:
        for sort in [False, True]:
            # First the frame as built, then its index-sorted counterpart.
            for frame in (df, df.sort_index()):
                result = frame.groupby(level=level, sort=sort).apply(
                    DataFrame.drop_duplicates)
                tm.assert_frame_equal(frame, result)
|
|
|
|
|
|
def test_index_label_overlaps_location():
    """``filter`` must not confuse index labels with row positions."""
    # checking we don't have any label/location confusion in
    # the wake of GH5375
    df = DataFrame(list('ABCDE'), index=[2, 0, 2, 1, 1])

    def check_filter(frame):
        # Only the 'b' rows (positions 1, 3, 4) form a group of more than
        # two members; check the frame and then its single column.
        g = frame.groupby(list('ababb'))
        actual = g.filter(lambda x: len(x) > 2)
        assert_frame_equal(actual, frame.iloc[[1, 3, 4]])

        ser = frame[0]
        g = ser.groupby(list('ababb'))
        actual = g.filter(lambda x: len(x) > 2)
        assert_series_equal(actual, ser.take([1, 3, 4]))

    check_filter(df)

    # ... and again, with a generic Index of floats
    df.index = df.index.astype(float)
    check_filter(df)
|
|
|
|
|
|
def test_transform_doesnt_clobber_ints():
    """``transform('mean')`` is the same for int and float group keys."""
    # GH 7972
    n = 6
    x = np.arange(n)
    int_keyed = DataFrame({'a': x // 2, 'b': 2.0 * x, 'c': 3.0 * x})
    float_keyed = DataFrame({'a': x // 2 * 1.0, 'b': 2.0 * x, 'c': 3.0 * x})

    result = int_keyed.groupby('a').transform('mean')
    expected = float_keyed.groupby('a').transform('mean')

    tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize('sort_column', ['ints', 'floats', 'strings',
                                         ['ints', 'floats'],
                                         ['ints', 'strings']])
@pytest.mark.parametrize('group_column', ['int_groups', 'string_groups',
                                          ['int_groups', 'string_groups']])
def test_groupby_preserves_sort(sort_column, group_column):
    """Each group keeps the row order of the pre-sorted frame."""
    # Test to ensure that groupby always preserves sort order of original
    # object. Issue #8588 and #9651
    data = {'int_groups': [3, 1, 0, 1, 0, 3, 3, 3],
            'string_groups': ['z', 'a', 'z', 'a', 'a', 'g', 'g', 'g'],
            'ints': [8, 7, 4, 5, 2, 9, 1, 1],
            'floats': [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
            'strings': ['z', 'd', 'a', 'e', 'word', 'word2', '42', '47']}

    # Try sorting on different types and with different group types
    presorted = DataFrame(data).sort_values(by=sort_column)
    g = presorted.groupby(group_column)

    def assert_still_sorted(x):
        # Re-sorting a group must be a no-op if order was preserved.
        assert_frame_equal(x, x.sort_values(by=sort_column))

    g.apply(assert_still_sorted)
|
|
|
|
|
|
def test_group_shift_with_null_key():
    """``shift`` on a groupby whose key is partially missing."""
    # This test is designed to replicate the segfault in issue #13813.
    n_rows = 1200

    # Generate a moderately large dataframe with occasional missing
    # values in column `B`, and then group by [`A`, `B`]. This should
    # force `-1` in `labels` array of `g.grouper.group_info` exactly
    # at those places, where the group-by key is partially missing.
    rows = [(i % 12, i % 3 if i % 3 else np.nan, i)
            for i in range(n_rows)]
    df = DataFrame(rows, dtype=float,
                   columns=["A", "B", "Z"], index=None)
    g = df.groupby(["A", "B"])

    # A row is expected to receive the value 12 rows further down, unless
    # its key is missing or no such row exists.
    shifted = [(i + 12 if i % 3 and i < n_rows - 12
                else np.nan)
               for i in range(n_rows)]
    expected = DataFrame(shifted, dtype=float,
                         columns=["Z"], index=None)

    result = g.shift(-1)
    assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_pivot_table_values_key_error():
    """``pivot_table`` raises KeyError for a ``values`` column not present."""
    # This test is designed to replicate the error in issue #14938
    # Use the stdlib ``datetime`` imported by this module directly rather
    # than reaching for it through the ``pd.datetime`` re-export.
    event_dates = pd.date_range(datetime.today(),
                                periods=20, freq='M').tolist()
    df = pd.DataFrame({'eventDate': event_dates,
                       'thename': range(0, 20)})

    df['year'] = df.set_index('eventDate').index.year
    df['month'] = df.set_index('eventDate').index.month

    # 'badname' is not a column of ``df``.
    with pytest.raises(KeyError):
        df.reset_index().pivot_table(index='year', columns='month',
                                     values='badname', aggfunc='count')
|
|
|
|
|
|
def test_empty_dataframe_groupby():
    """Summing an empty frame's groups gives an empty float frame."""
    # GH8093
    empty = DataFrame(columns=['A', 'B', 'C'])

    expected = DataFrame(columns=['B', 'C'], dtype=np.float64)
    expected.index.name = 'A'

    result = empty.groupby('A').sum()
    assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_tuple_warns():
    """A tuple ``by`` warns only when it is not itself a column label."""
    # https://github.com/pandas-dev/pandas/issues/18314
    df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2],
                       'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]})
    by = ('a', 'b')

    # Without the ('a', 'b') column the tuple is read as a list of keys.
    without_tuple_column = df[['a', 'b', 'c']]
    with tm.assert_produces_warning(FutureWarning) as w:
        without_tuple_column.groupby(by).c.mean()

    assert "Interpreting tuple 'by' as a list" in str(w[0].message)

    # With the column present no warning is expected.
    with tm.assert_produces_warning(None):
        df.groupby(by).c.mean()
|
|
|
|
|
|
def test_tuple_warns_unhashable():
    """A tuple of array-like keys also triggers the tuple-as-list warning."""
    # https://github.com/pandas-dev/pandas/issues/18314
    business_dates = date_range(start='4/1/2014', end='6/30/2014',
                                freq='B')
    df = DataFrame(1, index=business_dates, columns=['a', 'b'])

    by = (df.index.year, df.index.month)
    with tm.assert_produces_warning(FutureWarning) as w:
        df.groupby(by).nth([0, 3, -1])

    assert "Interpreting tuple 'by' as a list" in str(w[0].message)
|
|
|
|
|
|
def test_tuple_correct_keyerror():
    """A missing tuple key is reported whole in the KeyError message."""
    # https://github.com/pandas-dev/pandas/issues/18798
    df = pd.DataFrame(1, index=range(3),
                      columns=pd.MultiIndex.from_product([[1, 2],
                                                          [3, 4]]))
    # The parentheses are escaped so the pattern matches the literal text
    # "(7, 8)"; unescaped they form a regex group that only matches "7, 8".
    with pytest.raises(KeyError, match=r"\(7, 8\)"):
        df.groupby((7, 8)).mean()
|
|
|
|
|
|
def test_groupby_agg_ohlc_non_first():
    """``agg`` accepts 'ohlc' when it is not the first function listed."""
    # GH 21716
    df = pd.DataFrame([[1], [1]], columns=['foo'],
                      index=pd.date_range('2018-01-01', periods=2, freq='D'))

    ohlc_fields = ('open', 'high', 'low', 'close')
    column_tuples = [('foo', 'ohlc', field) for field in ohlc_fields]
    column_tuples.append(('foo', 'sum', 'foo'))

    expected = pd.DataFrame(
        [[1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1]],
        columns=pd.MultiIndex.from_tuples(column_tuples),
        index=pd.date_range('2018-01-01', periods=2, freq='D'))

    result = df.groupby(pd.Grouper(freq='D')).agg(['sum', 'ohlc'])

    tm.assert_frame_equal(result, expected)
|