218 lines
6.5 KiB
Python
218 lines
6.5 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
"""
|
||
|
test cython .agg behavior
|
||
|
"""
|
||
|
|
||
|
from __future__ import print_function
|
||
|
|
||
|
import pytest
|
||
|
|
||
|
import numpy as np
|
||
|
from numpy import nan
|
||
|
import pandas as pd
|
||
|
|
||
|
from pandas import (bdate_range, DataFrame, Index, Series, Timestamp,
|
||
|
Timedelta, NaT)
|
||
|
from pandas.core.groupby.groupby import DataError
|
||
|
import pandas.util.testing as tm
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize('op_name', [
    'count',
    'sum',
    'std',
    'var',
    'sem',
    'mean',
    'median',
    'prod',
    'min',
    'max',
])
def test_cythonized_aggers(op_name):
    """Compare a groupby reduction with the same reduction done per group.

    ``op_name`` is the name of the method that is looked up (via
    ``getattr``) and called without arguments, both on the groupby object
    and on column 'C' of each individual group.
    """
    df = DataFrame({'A': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan],
                    'B': ['A', 'B'] * 6,
                    'C': np.random.randn(12)})
    # knock out the rows labelled 2, 4, 6, 8 and 10 of column 'C'
    df.loc[2:10:2, 'C'] = nan

    def op(obj):
        return getattr(obj, op_name)()

    # single column
    by_a = df.drop(['B'], axis=1).groupby('A')
    per_key = {key: op(chunk['C']) for key, chunk in by_a}
    exp = DataFrame({'C': per_key})
    exp.index.name = 'A'
    tm.assert_frame_equal(op(by_a), exp)

    # multiple columns
    by_a_b = df.groupby(['A', 'B'])
    nested = {}
    for (key_a, key_b), chunk in by_a_b:
        nested.setdefault(key_a, {})[key_b] = op(chunk['C'])
    exp = DataFrame(nested).T.stack(dropna=False)
    exp.index.names = ['A', 'B']
    exp.name = 'C'

    result = op(by_a_b)['C']
    # only 'sum' and 'prod' are compared against the hand-built Series
    if op_name in ('sum', 'prod'):
        tm.assert_series_equal(result, exp)
|
||
|
|
||
|
|
||
|
def test_cython_agg_boolean():
    """Check that ``.mean()`` on a grouped bool column equals ``agg(np.mean)``."""
    columns = {'a': np.random.randint(0, 5, 50),
               'b': np.random.randint(0, 2, 50).astype('bool')}
    frame = DataFrame(columns)

    def grouped_b():
        # a fresh groupby selection for each side of the comparison
        return frame.groupby('a')['b']

    result = grouped_b().mean()
    expected = grouped_b().agg(np.mean)

    tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_cython_agg_nothing_to_agg():
    """Check that ``mean`` over a string-only selection raises ``DataError``.

    Two call shapes are exercised: selecting the string column from a
    groupby on 'a', and grouping a one-column string frame by the 'a'
    Series.
    """
    msg = "No numeric types to aggregate"

    def make_frame():
        return DataFrame({'a': np.random.randint(0, 5, 50),
                          'b': ['foo', 'bar'] * 25})

    # column selected after grouping
    frame = make_frame()
    with tm.assert_raises_regex(DataError, msg):
        frame.groupby('a')['b'].mean()

    # string-only frame grouped by an external key
    frame = make_frame()
    with tm.assert_raises_regex(DataError, msg):
        frame[['b']].groupby(frame['a']).mean()
|
||
|
|
||
|
|
||
|
def test_cython_agg_nothing_to_agg_with_dates():
    """Check that ``mean`` of a grouped datetime column raises ``DataError``."""
    columns = {}
    columns['a'] = np.random.randint(0, 5, 50)
    columns['b'] = ['foo', 'bar'] * 25
    columns['dates'] = pd.date_range('now', periods=50, freq='T')
    frame = DataFrame(columns)

    msg = "No numeric types to aggregate"
    by_b = frame.groupby('b')
    with tm.assert_raises_regex(DataError, msg):
        by_b.dates.mean()
|
||
|
|
||
|
|
||
|
def test_cython_agg_frame_columns():
    """Smoke test: a column-wise groupby mean can be evaluated repeatedly.

    Nothing is asserted about the result; the test only requires that the
    four consecutive calls do not raise.
    """
    # #2113
    df = DataFrame({'x': [1, 2, 3], 'y': [3, 4, 5]})

    # NOTE(review): the repetition presumably targets state carried over
    # between calls -- confirm against issue #2113.
    for _ in range(4):
        df.groupby(level=0, axis='columns').mean()
|
||
|
|
||
|
|
||
|
def test_cython_agg_return_dict():
    """Check that ``agg`` accepts a function that returns a dict per group."""
    # GH 16741
    col_a = ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo']
    col_b = ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three']
    df = DataFrame({'A': col_a,
                    'B': col_b,
                    'C': np.random.randn(8),
                    'D': np.random.randn(8)})

    def counts_as_dict(values):
        return values.value_counts().to_dict()

    ts = df.groupby('A')['B'].agg(lambda x: counts_as_dict(x))

    bar_counts = {'two': 1, 'one': 1, 'three': 1}
    foo_counts = {'two': 2, 'one': 2, 'three': 1}
    expected = Series([bar_counts, foo_counts],
                      index=Index(['bar', 'foo'], name='A'),
                      name='B')
    tm.assert_series_equal(ts, expected)
|
||
|
|
||
|
|
||
|
def test_cython_fail_agg():
    """Check that ``sum`` of a grouped string Series equals ``agg(np.sum)``."""
    business_days = bdate_range('1/1/2000', periods=50)
    letters = ['A', 'B', 'C', 'D', 'E'] * 10
    ts = Series(letters, index=business_days)

    # group on the month number of each index entry
    by_month = ts.groupby(lambda x: x.month)

    summed = by_month.sum()
    expected = by_month.agg(np.sum)
    tm.assert_series_equal(summed, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize('op, targop', [
    ('mean', np.mean),
    ('median', np.median),
    ('var', np.var),
    ('add', np.sum),
    ('prod', np.prod),
    ('min', np.min),
    ('max', np.max),
    ('first', lambda x: x.iloc[0]),
    ('last', lambda x: x.iloc[-1]),
])
def test__cython_agg_general(op, targop):
    """Compare ``_cython_agg_general(op)`` with ``agg(targop)``.

    ``op`` is the name handed to ``_cython_agg_general``; ``targop`` is the
    callable handed to ``agg`` to produce the expected frame.
    """
    df = DataFrame(np.random.randn(1000))
    # float-valued group labels drawn from the integers 0..49
    labels = np.random.randint(0, 50, size=1000).astype(float)

    via_cython = df.groupby(labels)._cython_agg_general(op)
    via_agg = df.groupby(labels).agg(targop)
    tm.assert_frame_equal(via_cython, via_agg)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize('op, targop', [
    ('mean', np.mean),
    ('median', lambda x: np.median(x) if len(x) > 0 else np.nan),
    ('var', lambda x: np.var(x, ddof=1)),
    ('min', np.min),
    ('max', np.max),
])
def test_cython_agg_empty_buckets(op, targop, observed):
    """Compare ``_cython_agg_general(op)`` with ``agg`` when bins are empty.

    Three values are cut into the bins given by ``range(0, 55, 5)``, so
    most bins hold no rows. ``observed`` is forwarded to ``groupby`` and is
    not defined in this block (presumably a pytest fixture -- confirm in
    conftest).
    """
    df = pd.DataFrame([11, 12, 13])
    bin_edges = range(0, 55, 5)

    def binned():
        # a fresh groupby over the binned values for each side
        return df.groupby(pd.cut(df[0], bin_edges), observed=observed)

    # calling _cython_agg_general directly, instead of via the user API
    # which sets different values for min_count, so do that here.
    result = binned()._cython_agg_general(op)

    # targop is passed through a lambda wrapper rather than directly
    expected = binned().agg(lambda x: targop(x))
    tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
def test_cython_agg_empty_buckets_nanops(observed):
    """Check 'add' and 'prod' via ``_cython_agg_general`` with empty bins.

    ``observed`` is forwarded to ``groupby``; when it is truthy the rows of
    the expected frame that equal the fill value (0 for 'add', 1 for
    'prod') are dropped before comparing.
    """
    # GH-18869 can't call nanops on empty groups, so hardcode expected
    # for these
    df = pd.DataFrame([11, 12, 13], columns=['a'])
    grps = range(0, 25, 5)
    intervals = pd.interval_range(0, 20, freq=5)

    cases = [
        # add / sum
        ('add', 0, [0, 0, 36, 0]),
        # prod
        ('prod', 1, [1, 1, 1716, 1]),
    ]
    for how, fill, values in cases:
        grouped = df.groupby(pd.cut(df['a'], grps), observed=observed)
        result = grouped._cython_agg_general(how)

        index = pd.CategoricalIndex(intervals, name='a', ordered=True)
        expected = pd.DataFrame({"a": values}, index=index)
        if observed:
            expected = expected[expected.a != fill]

        tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
@pytest.mark.parametrize('op', ['first', 'last', 'max', 'min'])
@pytest.mark.parametrize('data', [
    Timestamp('2016-10-14 21:00:44.557'),
    Timedelta('17088 days 21:00:44.557'),
])
def test_cython_with_timestamp_and_nat(op, data):
    """Check ``aggregate(op)`` on a column holding one value and one ``NaT``.

    Each key in 'a' occurs once, and the expected frame holds the same 'b'
    values indexed by 'a'.
    """
    # https://github.com/pandas-dev/pandas/issues/19526
    values = [data, NaT]
    df = DataFrame({'a': [0, 1], 'b': values})

    # We will group by a and test the cython aggregations
    expected = DataFrame({'b': [data, NaT]},
                         index=Index([0, 1], name='a'))

    result = df.groupby('a').aggregate(op)
    tm.assert_frame_equal(expected, result)
|