530 lines
17 KiB
Python
530 lines
17 KiB
Python
import pytest
|
|
import numpy as np
|
|
import pandas as pd
|
|
from datetime import datetime
|
|
from pandas.util import testing as tm
|
|
from pandas import DataFrame, MultiIndex, compat, Series, bdate_range, Index
|
|
|
|
|
|
def test_apply_issues():
|
|
# GH 5788
|
|
|
|
s = """2011.05.16,00:00,1.40893
|
|
2011.05.16,01:00,1.40760
|
|
2011.05.16,02:00,1.40750
|
|
2011.05.16,03:00,1.40649
|
|
2011.05.17,02:00,1.40893
|
|
2011.05.17,03:00,1.40760
|
|
2011.05.17,04:00,1.40750
|
|
2011.05.17,05:00,1.40649
|
|
2011.05.18,02:00,1.40893
|
|
2011.05.18,03:00,1.40760
|
|
2011.05.18,04:00,1.40750
|
|
2011.05.18,05:00,1.40649"""
|
|
|
|
df = pd.read_csv(
|
|
compat.StringIO(s), header=None, names=['date', 'time', 'value'],
|
|
parse_dates=[['date', 'time']])
|
|
df = df.set_index('date_time')
|
|
|
|
expected = df.groupby(df.index.date).idxmax()
|
|
result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# GH 5789
|
|
# don't auto coerce dates
|
|
df = pd.read_csv(
|
|
compat.StringIO(s), header=None, names=['date', 'time', 'value'])
|
|
exp_idx = pd.Index(
|
|
['2011.05.16', '2011.05.17', '2011.05.18'
|
|
], dtype=object, name='date')
|
|
expected = Series(['00:00', '02:00', '02:00'], index=exp_idx)
|
|
result = df.groupby('date').apply(
|
|
lambda x: x['time'][x['value'].idxmax()])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_apply_trivial():
|
|
# GH 20066
|
|
# trivial apply: ignore input and return a constant dataframe.
|
|
df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
|
|
'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
|
|
columns=['key', 'data'])
|
|
expected = pd.concat([df.iloc[1:], df.iloc[1:]],
|
|
axis=1, keys=['float64', 'object'])
|
|
result = df.groupby([str(x) for x in df.dtypes],
|
|
axis=1).apply(lambda x: df.iloc[1:])
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.xfail(reason=("GH 20066; function passed into apply "
|
|
"returns a DataFrame with the same index "
|
|
"as the one to create GroupBy object."))
|
|
def test_apply_trivial_fail():
|
|
# GH 20066
|
|
# trivial apply fails if the constant dataframe has the same index
|
|
# with the one used to create GroupBy object.
|
|
df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'a'],
|
|
'data': [1.0, 2.0, 3.0, 4.0, 5.0]},
|
|
columns=['key', 'data'])
|
|
expected = pd.concat([df, df],
|
|
axis=1, keys=['float64', 'object'])
|
|
result = df.groupby([str(x) for x in df.dtypes],
|
|
axis=1).apply(lambda x: df)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_fast_apply():
|
|
# make sure that fast apply is correctly called
|
|
# rather than raising any kind of error
|
|
# otherwise the python path will be callsed
|
|
# which slows things down
|
|
N = 1000
|
|
labels = np.random.randint(0, 2000, size=N)
|
|
labels2 = np.random.randint(0, 3, size=N)
|
|
df = DataFrame({'key': labels,
|
|
'key2': labels2,
|
|
'value1': np.random.randn(N),
|
|
'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})
|
|
|
|
def f(g):
|
|
return 1
|
|
|
|
g = df.groupby(['key', 'key2'])
|
|
|
|
grouper = g.grouper
|
|
|
|
splitter = grouper._get_splitter(g._selected_obj, axis=g.axis)
|
|
group_keys = grouper._get_group_keys()
|
|
|
|
values, mutated = splitter.fast_apply(f, group_keys)
|
|
assert not mutated
|
|
|
|
|
|
def test_apply_with_mixed_dtype():
|
|
# GH3480, apply with mixed dtype on axis=1 breaks in 0.11
|
|
df = DataFrame({'foo1': np.random.randn(6),
|
|
'foo2': ['one', 'two', 'two', 'three', 'one', 'two']})
|
|
result = df.apply(lambda x: x, axis=1)
|
|
tm.assert_series_equal(df.get_dtype_counts(), result.get_dtype_counts())
|
|
|
|
# GH 3610 incorrect dtype conversion with as_index=False
|
|
df = DataFrame({"c1": [1, 2, 6, 6, 8]})
|
|
df["c2"] = df.c1 / 2.0
|
|
result1 = df.groupby("c2").mean().reset_index().c2
|
|
result2 = df.groupby("c2", as_index=False).mean().c2
|
|
tm.assert_series_equal(result1, result2)
|
|
|
|
|
|
def test_groupby_as_index_apply(df):
|
|
# GH #4648 and #3417
|
|
df = DataFrame({'item_id': ['b', 'b', 'a', 'c', 'a', 'b'],
|
|
'user_id': [1, 2, 1, 1, 3, 1],
|
|
'time': range(6)})
|
|
|
|
g_as = df.groupby('user_id', as_index=True)
|
|
g_not_as = df.groupby('user_id', as_index=False)
|
|
|
|
res_as = g_as.head(2).index
|
|
res_not_as = g_not_as.head(2).index
|
|
exp = Index([0, 1, 2, 4])
|
|
tm.assert_index_equal(res_as, exp)
|
|
tm.assert_index_equal(res_not_as, exp)
|
|
|
|
res_as_apply = g_as.apply(lambda x: x.head(2)).index
|
|
res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index
|
|
|
|
# apply doesn't maintain the original ordering
|
|
# changed in GH5610 as the as_index=False returns a MI here
|
|
exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (
|
|
2, 4)])
|
|
tp = [(1, 0), (1, 2), (2, 1), (3, 4)]
|
|
exp_as_apply = MultiIndex.from_tuples(tp, names=['user_id', None])
|
|
|
|
tm.assert_index_equal(res_as_apply, exp_as_apply)
|
|
tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)
|
|
|
|
ind = Index(list('abcde'))
|
|
df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind)
|
|
res = df.groupby(0, as_index=False).apply(lambda x: x).index
|
|
tm.assert_index_equal(res, ind)
|
|
|
|
|
|
def test_apply_concat_preserve_names(three_group):
|
|
grouped = three_group.groupby(['A', 'B'])
|
|
|
|
def desc(group):
|
|
result = group.describe()
|
|
result.index.name = 'stat'
|
|
return result
|
|
|
|
def desc2(group):
|
|
result = group.describe()
|
|
result.index.name = 'stat'
|
|
result = result[:len(group)]
|
|
# weirdo
|
|
return result
|
|
|
|
def desc3(group):
|
|
result = group.describe()
|
|
|
|
# names are different
|
|
result.index.name = 'stat_%d' % len(group)
|
|
|
|
result = result[:len(group)]
|
|
# weirdo
|
|
return result
|
|
|
|
result = grouped.apply(desc)
|
|
assert result.index.names == ('A', 'B', 'stat')
|
|
|
|
result2 = grouped.apply(desc2)
|
|
assert result2.index.names == ('A', 'B', 'stat')
|
|
|
|
result3 = grouped.apply(desc3)
|
|
assert result3.index.names == ('A', 'B', None)
|
|
|
|
|
|
def test_apply_series_to_frame():
|
|
def f(piece):
|
|
with np.errstate(invalid='ignore'):
|
|
logged = np.log(piece)
|
|
return DataFrame({'value': piece,
|
|
'demeaned': piece - piece.mean(),
|
|
'logged': logged})
|
|
|
|
dr = bdate_range('1/1/2000', periods=100)
|
|
ts = Series(np.random.randn(100), index=dr)
|
|
|
|
grouped = ts.groupby(lambda x: x.month)
|
|
result = grouped.apply(f)
|
|
|
|
assert isinstance(result, DataFrame)
|
|
tm.assert_index_equal(result.index, ts.index)
|
|
|
|
|
|
def test_apply_series_yield_constant(df):
|
|
result = df.groupby(['A', 'B'])['C'].apply(len)
|
|
assert result.index.names[:2] == ('A', 'B')
|
|
|
|
|
|
def test_apply_frame_yield_constant(df):
|
|
# GH13568
|
|
result = df.groupby(['A', 'B']).apply(len)
|
|
assert isinstance(result, Series)
|
|
assert result.name is None
|
|
|
|
result = df.groupby(['A', 'B'])[['C', 'D']].apply(len)
|
|
assert isinstance(result, Series)
|
|
assert result.name is None
|
|
|
|
|
|
def test_apply_frame_to_series(df):
|
|
grouped = df.groupby(['A', 'B'])
|
|
result = grouped.apply(len)
|
|
expected = grouped.count()['C']
|
|
tm.assert_index_equal(result.index, expected.index)
|
|
tm.assert_numpy_array_equal(result.values, expected.values)
|
|
|
|
|
|
def test_apply_frame_concat_series():
|
|
def trans(group):
|
|
return group.groupby('B')['C'].sum().sort_values()[:2]
|
|
|
|
def trans2(group):
|
|
grouped = group.groupby(df.reindex(group.index)['B'])
|
|
return grouped.sum().sort_values()[:2]
|
|
|
|
df = DataFrame({'A': np.random.randint(0, 5, 1000),
|
|
'B': np.random.randint(0, 5, 1000),
|
|
'C': np.random.randn(1000)})
|
|
|
|
result = df.groupby('A').apply(trans)
|
|
exp = df.groupby('A')['C'].apply(trans2)
|
|
tm.assert_series_equal(result, exp, check_names=False)
|
|
assert result.name == 'C'
|
|
|
|
|
|
def test_apply_transform(ts):
|
|
grouped = ts.groupby(lambda x: x.month)
|
|
result = grouped.apply(lambda x: x * 2)
|
|
expected = grouped.transform(lambda x: x * 2)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_apply_multikey_corner(tsframe):
|
|
grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
|
|
|
|
def f(group):
|
|
return group.sort_values('A')[-5:]
|
|
|
|
result = grouped.apply(f)
|
|
for key, group in grouped:
|
|
tm.assert_frame_equal(result.loc[key], f(group))
|
|
|
|
|
|
def test_apply_chunk_view():
|
|
# Low level tinkering could be unsafe, make sure not
|
|
df = DataFrame({'key': [1, 1, 1, 2, 2, 2, 3, 3, 3],
|
|
'value': compat.lrange(9)})
|
|
|
|
# return view
|
|
f = lambda x: x[:2]
|
|
|
|
result = df.groupby('key', group_keys=False).apply(f)
|
|
expected = df.take([0, 1, 3, 4, 6, 7])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_apply_no_name_column_conflict():
|
|
df = DataFrame({'name': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
|
|
'name2': [0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
|
|
'value': compat.lrange(10)[::-1]})
|
|
|
|
# it works! #2605
|
|
grouped = df.groupby(['name', 'name2'])
|
|
grouped.apply(lambda x: x.sort_values('value', inplace=True))
|
|
|
|
|
|
def test_apply_typecast_fail():
|
|
df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
|
|
'c': np.tile(
|
|
['a', 'b', 'c'], 2),
|
|
'v': np.arange(1., 7.)})
|
|
|
|
def f(group):
|
|
v = group['v']
|
|
group['v2'] = (v - v.min()) / (v.max() - v.min())
|
|
return group
|
|
|
|
result = df.groupby('d').apply(f)
|
|
|
|
expected = df.copy()
|
|
expected['v2'] = np.tile([0., 0.5, 1], 2)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_apply_multiindex_fail():
|
|
index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]
|
|
])
|
|
df = DataFrame({'d': [1., 1., 1., 2., 2., 2.],
|
|
'c': np.tile(['a', 'b', 'c'], 2),
|
|
'v': np.arange(1., 7.)}, index=index)
|
|
|
|
def f(group):
|
|
v = group['v']
|
|
group['v2'] = (v - v.min()) / (v.max() - v.min())
|
|
return group
|
|
|
|
result = df.groupby('d').apply(f)
|
|
|
|
expected = df.copy()
|
|
expected['v2'] = np.tile([0., 0.5, 1], 2)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_apply_corner(tsframe):
|
|
result = tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2)
|
|
expected = tsframe * 2
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_apply_without_copy():
|
|
# GH 5545
|
|
# returning a non-copy in an applied function fails
|
|
|
|
data = DataFrame({'id_field': [100, 100, 200, 300],
|
|
'category': ['a', 'b', 'c', 'c'],
|
|
'value': [1, 2, 3, 4]})
|
|
|
|
def filt1(x):
|
|
if x.shape[0] == 1:
|
|
return x.copy()
|
|
else:
|
|
return x[x.category == 'c']
|
|
|
|
def filt2(x):
|
|
if x.shape[0] == 1:
|
|
return x
|
|
else:
|
|
return x[x.category == 'c']
|
|
|
|
expected = data.groupby('id_field').apply(filt1)
|
|
result = data.groupby('id_field').apply(filt2)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_apply_corner_cases():
|
|
# #535, can't use sliding iterator
|
|
|
|
N = 1000
|
|
labels = np.random.randint(0, 100, size=N)
|
|
df = DataFrame({'key': labels,
|
|
'value1': np.random.randn(N),
|
|
'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})
|
|
|
|
grouped = df.groupby('key')
|
|
|
|
def f(g):
|
|
g['value3'] = g['value1'] * 2
|
|
return g
|
|
|
|
result = grouped.apply(f)
|
|
assert 'value3' in result
|
|
|
|
|
|
def test_apply_numeric_coercion_when_datetime():
|
|
# In the past, group-by/apply operations have been over-eager
|
|
# in converting dtypes to numeric, in the presence of datetime
|
|
# columns. Various GH issues were filed, the reproductions
|
|
# for which are here.
|
|
|
|
# GH 15670
|
|
df = pd.DataFrame({'Number': [1, 2],
|
|
'Date': ["2017-03-02"] * 2,
|
|
'Str': ["foo", "inf"]})
|
|
expected = df.groupby(['Number']).apply(lambda x: x.iloc[0])
|
|
df.Date = pd.to_datetime(df.Date)
|
|
result = df.groupby(['Number']).apply(lambda x: x.iloc[0])
|
|
tm.assert_series_equal(result['Str'], expected['Str'])
|
|
|
|
# GH 15421
|
|
df = pd.DataFrame({'A': [10, 20, 30],
|
|
'B': ['foo', '3', '4'],
|
|
'T': [pd.Timestamp("12:31:22")] * 3})
|
|
|
|
def get_B(g):
|
|
return g.iloc[0][['B']]
|
|
result = df.groupby('A').apply(get_B)['B']
|
|
expected = df.B
|
|
expected.index = df.A
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# GH 14423
|
|
def predictions(tool):
|
|
out = pd.Series(index=['p1', 'p2', 'useTime'], dtype=object)
|
|
if 'step1' in list(tool.State):
|
|
out['p1'] = str(tool[tool.State == 'step1'].Machine.values[0])
|
|
if 'step2' in list(tool.State):
|
|
out['p2'] = str(tool[tool.State == 'step2'].Machine.values[0])
|
|
out['useTime'] = str(
|
|
tool[tool.State == 'step2'].oTime.values[0])
|
|
return out
|
|
df1 = pd.DataFrame({'Key': ['B', 'B', 'A', 'A'],
|
|
'State': ['step1', 'step2', 'step1', 'step2'],
|
|
'oTime': ['', '2016-09-19 05:24:33',
|
|
'', '2016-09-19 23:59:04'],
|
|
'Machine': ['23', '36L', '36R', '36R']})
|
|
df2 = df1.copy()
|
|
df2.oTime = pd.to_datetime(df2.oTime)
|
|
expected = df1.groupby('Key').apply(predictions).p1
|
|
result = df2.groupby('Key').apply(predictions).p1
|
|
tm.assert_series_equal(expected, result)
|
|
|
|
|
|
def test_time_field_bug():
|
|
# Test a fix for the following error related to GH issue 11324 When
|
|
# non-key fields in a group-by dataframe contained time-based fields
|
|
# that were not returned by the apply function, an exception would be
|
|
# raised.
|
|
|
|
df = pd.DataFrame({'a': 1, 'b': [datetime.now() for nn in range(10)]})
|
|
|
|
def func_with_no_date(batch):
|
|
return pd.Series({'c': 2})
|
|
|
|
def func_with_date(batch):
|
|
return pd.Series({'b': datetime(2015, 1, 1), 'c': 2})
|
|
|
|
dfg_no_conversion = df.groupby(by=['a']).apply(func_with_no_date)
|
|
dfg_no_conversion_expected = pd.DataFrame({'c': 2}, index=[1])
|
|
dfg_no_conversion_expected.index.name = 'a'
|
|
|
|
dfg_conversion = df.groupby(by=['a']).apply(func_with_date)
|
|
dfg_conversion_expected = pd.DataFrame(
|
|
{'b': datetime(2015, 1, 1),
|
|
'c': 2}, index=[1])
|
|
dfg_conversion_expected.index.name = 'a'
|
|
|
|
tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
|
|
tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected)
|
|
|
|
|
|
def test_gb_apply_list_of_unequal_len_arrays():
|
|
|
|
# GH1738
|
|
df = DataFrame({'group1': ['a', 'a', 'a', 'b', 'b', 'b', 'a', 'a', 'a',
|
|
'b', 'b', 'b'],
|
|
'group2': ['c', 'c', 'd', 'd', 'd', 'e', 'c', 'c', 'd',
|
|
'd', 'd', 'e'],
|
|
'weight': [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
|
|
'value': [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3]})
|
|
df = df.set_index(['group1', 'group2'])
|
|
df_grouped = df.groupby(level=['group1', 'group2'], sort=True)
|
|
|
|
def noddy(value, weight):
|
|
out = np.array(value * weight).repeat(3)
|
|
return out
|
|
|
|
# the kernel function returns arrays of unequal length
|
|
# pandas sniffs the first one, sees it's an array and not
|
|
# a list, and assumed the rest are of equal length
|
|
# and so tries a vstack
|
|
|
|
# don't die
|
|
df_grouped.apply(lambda x: noddy(x.value, x.weight))
|
|
|
|
|
|
def test_groupby_apply_all_none():
|
|
# Tests to make sure no errors if apply function returns all None
|
|
# values. Issue 9684.
|
|
test_df = DataFrame({'groups': [0, 0, 1, 1],
|
|
'random_vars': [8, 7, 4, 5]})
|
|
|
|
def test_func(x):
|
|
pass
|
|
|
|
result = test_df.groupby('groups').apply(test_func)
|
|
expected = DataFrame()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_groupby_apply_none_first():
|
|
# GH 12824. Tests if apply returns None first.
|
|
test_df1 = DataFrame({'groups': [1, 1, 1, 2], 'vars': [0, 1, 2, 3]})
|
|
test_df2 = DataFrame({'groups': [1, 2, 2, 2], 'vars': [0, 1, 2, 3]})
|
|
|
|
def test_func(x):
|
|
if x.shape[0] < 2:
|
|
return None
|
|
return x.iloc[[0, -1]]
|
|
|
|
result1 = test_df1.groupby('groups').apply(test_func)
|
|
result2 = test_df2.groupby('groups').apply(test_func)
|
|
index1 = MultiIndex.from_arrays([[1, 1], [0, 2]],
|
|
names=['groups', None])
|
|
index2 = MultiIndex.from_arrays([[2, 2], [1, 3]],
|
|
names=['groups', None])
|
|
expected1 = DataFrame({'groups': [1, 1], 'vars': [0, 2]},
|
|
index=index1)
|
|
expected2 = DataFrame({'groups': [2, 2], 'vars': [1, 3]},
|
|
index=index2)
|
|
tm.assert_frame_equal(result1, expected1)
|
|
tm.assert_frame_equal(result2, expected2)
|
|
|
|
|
|
def test_apply_with_mixed_types():
|
|
# gh-20949
|
|
df = pd.DataFrame({'A': 'a a b'.split(), 'B': [1, 2, 3], 'C': [4, 6, 5]})
|
|
g = df.groupby('A')
|
|
|
|
result = g.transform(lambda x: x / x.sum())
|
|
expected = pd.DataFrame({'B': [1 / 3., 2 / 3., 1], 'C': [0.4, 0.6, 1.0]})
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = g.apply(lambda x: x / x.sum())
|
|
tm.assert_frame_equal(result, expected)
|