872 lines
32 KiB
Python
872 lines
32 KiB
Python
# -*- coding: utf-8 -*-
|
|
from __future__ import print_function
|
|
from datetime import datetime
|
|
|
|
import pytest
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
from pandas.compat import PY37
|
|
from pandas import (Index, MultiIndex, CategoricalIndex,
|
|
DataFrame, Categorical, Series, qcut)
|
|
from pandas.util.testing import assert_frame_equal, assert_series_equal
|
|
import pandas.util.testing as tm
|
|
|
|
|
|
def cartesian_product_for_groupers(result, args, names):
|
|
""" Reindex to a cartesian production for the groupers,
|
|
preserving the nature (Categorical) of each grouper """
|
|
|
|
def f(a):
|
|
if isinstance(a, (CategoricalIndex, Categorical)):
|
|
categories = a.categories
|
|
a = Categorical.from_codes(np.arange(len(categories)),
|
|
categories=categories,
|
|
ordered=a.ordered)
|
|
return a
|
|
|
|
index = pd.MultiIndex.from_product(map(f, args), names=names)
|
|
return result.reindex(index).sort_index()
|
|
|
|
|
|
def test_apply_use_categorical_name(df):
|
|
cats = qcut(df.C, 4)
|
|
|
|
def get_stats(group):
|
|
return {'min': group.min(),
|
|
'max': group.max(),
|
|
'count': group.count(),
|
|
'mean': group.mean()}
|
|
|
|
result = df.groupby(cats, observed=False).D.apply(get_stats)
|
|
assert result.index.names[0] == 'C'
|
|
|
|
|
|
def test_basic():
|
|
|
|
cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
|
|
categories=["a", "b", "c", "d"], ordered=True)
|
|
data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
|
|
|
|
exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
|
|
expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
|
|
result = data.groupby("b", observed=False).mean()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
cat1 = Categorical(["a", "a", "b", "b"],
|
|
categories=["a", "b", "z"], ordered=True)
|
|
cat2 = Categorical(["c", "d", "c", "d"],
|
|
categories=["c", "d", "y"], ordered=True)
|
|
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
|
|
|
|
# single grouper
|
|
gb = df.groupby("A", observed=False)
|
|
exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
|
|
expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
|
|
result = gb.sum()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# GH 8623
|
|
x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
|
|
[1, 'John P. Doe']],
|
|
columns=['person_id', 'person_name'])
|
|
x['person_name'] = Categorical(x.person_name)
|
|
|
|
g = x.groupby(['person_id'], observed=False)
|
|
result = g.transform(lambda x: x)
|
|
tm.assert_frame_equal(result, x[['person_name']])
|
|
|
|
result = x.drop_duplicates('person_name')
|
|
expected = x.iloc[[0, 1]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def f(x):
|
|
return x.drop_duplicates('person_name').iloc[0]
|
|
|
|
result = g.apply(f)
|
|
expected = x.iloc[[0, 1]].copy()
|
|
expected.index = Index([1, 2], name='person_id')
|
|
expected['person_name'] = expected['person_name'].astype('object')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# GH 9921
|
|
# Monotonic
|
|
df = DataFrame({"a": [5, 15, 25]})
|
|
c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])
|
|
|
|
result = df.a.groupby(c, observed=False).transform(sum)
|
|
tm.assert_series_equal(result, df['a'])
|
|
|
|
tm.assert_series_equal(
|
|
df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
|
|
df['a'])
|
|
tm.assert_frame_equal(
|
|
df.groupby(c, observed=False).transform(sum),
|
|
df[['a']])
|
|
tm.assert_frame_equal(
|
|
df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
|
|
df[['a']])
|
|
|
|
# Filter
|
|
tm.assert_series_equal(
|
|
df.a.groupby(c, observed=False).filter(np.all),
|
|
df['a'])
|
|
tm.assert_frame_equal(
|
|
df.groupby(c, observed=False).filter(np.all),
|
|
df)
|
|
|
|
# Non-monotonic
|
|
df = DataFrame({"a": [5, 15, 25, -5]})
|
|
c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])
|
|
|
|
result = df.a.groupby(c, observed=False).transform(sum)
|
|
tm.assert_series_equal(result, df['a'])
|
|
|
|
tm.assert_series_equal(
|
|
df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
|
|
df['a'])
|
|
tm.assert_frame_equal(
|
|
df.groupby(c, observed=False).transform(sum),
|
|
df[['a']])
|
|
tm.assert_frame_equal(
|
|
df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
|
|
df[['a']])
|
|
|
|
# GH 9603
|
|
df = DataFrame({'a': [1, 0, 0, 0]})
|
|
c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
|
|
result = df.groupby(c, observed=False).apply(len)
|
|
|
|
exp_index = CategoricalIndex(
|
|
c.values.categories, ordered=c.values.ordered)
|
|
expected = Series([1, 0, 0, 0], index=exp_index)
|
|
expected.index.name = 'a'
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# more basic
|
|
levels = ['foo', 'bar', 'baz', 'qux']
|
|
codes = np.random.randint(0, 4, size=100)
|
|
|
|
cats = Categorical.from_codes(codes, levels, ordered=True)
|
|
|
|
data = DataFrame(np.random.randn(100, 4))
|
|
|
|
result = data.groupby(cats, observed=False).mean()
|
|
|
|
expected = data.groupby(np.asarray(cats), observed=False).mean()
|
|
exp_idx = CategoricalIndex(levels, categories=cats.categories,
|
|
ordered=True)
|
|
expected = expected.reindex(exp_idx)
|
|
|
|
assert_frame_equal(result, expected)
|
|
|
|
grouped = data.groupby(cats, observed=False)
|
|
desc_result = grouped.describe()
|
|
|
|
idx = cats.codes.argsort()
|
|
ord_labels = np.asarray(cats).take(idx)
|
|
ord_data = data.take(idx)
|
|
|
|
exp_cats = Categorical(ord_labels, ordered=True,
|
|
categories=['foo', 'bar', 'baz', 'qux'])
|
|
expected = ord_data.groupby(
|
|
exp_cats, sort=False, observed=False).describe()
|
|
assert_frame_equal(desc_result, expected)
|
|
|
|
# GH 10460
|
|
expc = Categorical.from_codes(np.arange(4).repeat(8),
|
|
levels, ordered=True)
|
|
exp = CategoricalIndex(expc)
|
|
tm.assert_index_equal((desc_result.stack().index
|
|
.get_level_values(0)), exp)
|
|
exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
|
|
'75%', 'max'] * 4)
|
|
tm.assert_index_equal((desc_result.stack().index
|
|
.get_level_values(1)), exp)
|
|
|
|
|
|
def test_level_get_group(observed):
|
|
# GH15155
|
|
df = DataFrame(data=np.arange(2, 22, 2),
|
|
index=MultiIndex(
|
|
levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
|
|
labels=[[0] * 5 + [1] * 5, range(10)],
|
|
names=["Index1", "Index2"]))
|
|
g = df.groupby(level=["Index1"], observed=observed)
|
|
|
|
# expected should equal test.loc[["a"]]
|
|
# GH15166
|
|
expected = DataFrame(data=np.arange(2, 12, 2),
|
|
index=pd.MultiIndex(levels=[pd.CategoricalIndex(
|
|
["a", "b"]), range(5)],
|
|
labels=[[0] * 5, range(5)],
|
|
names=["Index1", "Index2"]))
|
|
result = g.get_group('a')
|
|
|
|
assert_frame_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.xfail(PY37, reason="flaky on 3.7, xref gh-21636")
|
|
@pytest.mark.parametrize('ordered', [True, False])
|
|
def test_apply(ordered):
|
|
# GH 10138
|
|
|
|
dense = Categorical(list('abc'), ordered=ordered)
|
|
|
|
# 'b' is in the categories but not in the list
|
|
missing = Categorical(
|
|
list('aaa'), categories=['a', 'b'], ordered=ordered)
|
|
values = np.arange(len(dense))
|
|
df = DataFrame({'missing': missing,
|
|
'dense': dense,
|
|
'values': values})
|
|
grouped = df.groupby(['missing', 'dense'], observed=True)
|
|
|
|
# missing category 'b' should still exist in the output index
|
|
idx = MultiIndex.from_arrays(
|
|
[missing, dense], names=['missing', 'dense'])
|
|
expected = DataFrame([0, 1, 2.],
|
|
index=idx,
|
|
columns=['values'])
|
|
|
|
result = grouped.apply(lambda x: np.mean(x))
|
|
assert_frame_equal(result, expected)
|
|
|
|
# we coerce back to ints
|
|
expected = expected.astype('int')
|
|
result = grouped.mean()
|
|
assert_frame_equal(result, expected)
|
|
|
|
result = grouped.agg(np.mean)
|
|
assert_frame_equal(result, expected)
|
|
|
|
# but for transform we should still get back the original index
|
|
idx = MultiIndex.from_arrays([missing, dense],
|
|
names=['missing', 'dense'])
|
|
expected = Series(1, index=idx)
|
|
result = grouped.apply(lambda x: 1)
|
|
assert_series_equal(result, expected)
|
|
|
|
|
|
def test_observed(observed):
|
|
# multiple groupers, don't re-expand the output space
|
|
# of the grouper
|
|
# gh-14942 (implement)
|
|
# gh-10132 (back-compat)
|
|
# gh-8138 (back-compat)
|
|
# gh-8869
|
|
|
|
cat1 = Categorical(["a", "a", "b", "b"],
|
|
categories=["a", "b", "z"], ordered=True)
|
|
cat2 = Categorical(["c", "d", "c", "d"],
|
|
categories=["c", "d", "y"], ordered=True)
|
|
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
|
|
df['C'] = ['foo', 'bar'] * 2
|
|
|
|
# multiple groupers with a non-cat
|
|
gb = df.groupby(['A', 'B', 'C'], observed=observed)
|
|
exp_index = pd.MultiIndex.from_arrays(
|
|
[cat1, cat2, ['foo', 'bar'] * 2],
|
|
names=['A', 'B', 'C'])
|
|
expected = DataFrame({'values': Series(
|
|
[1, 2, 3, 4], index=exp_index)}).sort_index()
|
|
result = gb.sum()
|
|
if not observed:
|
|
expected = cartesian_product_for_groupers(
|
|
expected,
|
|
[cat1, cat2, ['foo', 'bar']],
|
|
list('ABC'))
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
gb = df.groupby(['A', 'B'], observed=observed)
|
|
exp_index = pd.MultiIndex.from_arrays(
|
|
[cat1, cat2],
|
|
names=['A', 'B'])
|
|
expected = DataFrame({'values': [1, 2, 3, 4]},
|
|
index=exp_index)
|
|
result = gb.sum()
|
|
if not observed:
|
|
expected = cartesian_product_for_groupers(
|
|
expected,
|
|
[cat1, cat2],
|
|
list('AB'))
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# https://github.com/pandas-dev/pandas/issues/8138
|
|
d = {'cat':
|
|
pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
|
|
ordered=True),
|
|
'ints': [1, 1, 2, 2],
|
|
'val': [10, 20, 30, 40]}
|
|
df = pd.DataFrame(d)
|
|
|
|
# Grouping on a single column
|
|
groups_single_key = df.groupby("cat", observed=observed)
|
|
result = groups_single_key.mean()
|
|
|
|
exp_index = pd.CategoricalIndex(list('ab'), name="cat",
|
|
categories=list('abc'),
|
|
ordered=True)
|
|
expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]},
|
|
index=exp_index)
|
|
if not observed:
|
|
index = pd.CategoricalIndex(list('abc'), name="cat",
|
|
categories=list('abc'),
|
|
ordered=True)
|
|
expected = expected.reindex(index)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Grouping on two columns
|
|
groups_double_key = df.groupby(["cat", "ints"], observed=observed)
|
|
result = groups_double_key.agg('mean')
|
|
expected = DataFrame(
|
|
{"val": [10, 30, 20, 40],
|
|
"cat": pd.Categorical(['a', 'a', 'b', 'b'],
|
|
categories=['a', 'b', 'c'],
|
|
ordered=True),
|
|
"ints": [1, 2, 1, 2]}).set_index(["cat", "ints"])
|
|
if not observed:
|
|
expected = cartesian_product_for_groupers(
|
|
expected,
|
|
[df.cat.values, [1, 2]],
|
|
['cat', 'ints'])
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# GH 10132
|
|
for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
|
|
c, i = key
|
|
result = groups_double_key.get_group(key)
|
|
expected = df[(df.cat == c) & (df.ints == i)]
|
|
assert_frame_equal(result, expected)
|
|
|
|
# gh-8869
|
|
# with as_index
|
|
d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70],
|
|
'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']}
|
|
df = pd.DataFrame(d)
|
|
cat = pd.cut(df['foo'], np.linspace(0, 10, 3))
|
|
df['range'] = cat
|
|
groups = df.groupby(['range', 'baz'], as_index=False, observed=observed)
|
|
result = groups.agg('mean')
|
|
|
|
groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed)
|
|
expected = groups2.agg('mean').reset_index()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_observed_codes_remap(observed):
|
|
d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]}
|
|
df = pd.DataFrame(d)
|
|
values = pd.cut(df['C1'], [1, 2, 3, 6])
|
|
values.name = "cat"
|
|
groups_double_key = df.groupby([values, 'C2'], observed=observed)
|
|
|
|
idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]],
|
|
names=["cat", "C2"])
|
|
expected = DataFrame({"C1": [3, 3, 4, 5],
|
|
"C3": [10, 100, 200, 34]}, index=idx)
|
|
if not observed:
|
|
expected = cartesian_product_for_groupers(
|
|
expected,
|
|
[values.values, [1, 2, 3, 4]],
|
|
['cat', 'C2'])
|
|
|
|
result = groups_double_key.agg('mean')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_observed_perf():
|
|
# we create a cartesian product, so this is
|
|
# non-performant if we don't use observed values
|
|
# gh-14942
|
|
df = DataFrame({
|
|
'cat': np.random.randint(0, 255, size=30000),
|
|
'int_id': np.random.randint(0, 255, size=30000),
|
|
'other_id': np.random.randint(0, 10000, size=30000),
|
|
'foo': 0})
|
|
df['cat'] = df.cat.astype(str).astype('category')
|
|
|
|
grouped = df.groupby(['cat', 'int_id', 'other_id'], observed=True)
|
|
result = grouped.count()
|
|
assert result.index.levels[0].nunique() == df.cat.nunique()
|
|
assert result.index.levels[1].nunique() == df.int_id.nunique()
|
|
assert result.index.levels[2].nunique() == df.other_id.nunique()
|
|
|
|
|
|
def test_observed_groups(observed):
|
|
# gh-20583
|
|
# test that we have the appropriate groups
|
|
|
|
cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c'])
|
|
df = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]})
|
|
g = df.groupby('cat', observed=observed)
|
|
|
|
result = g.groups
|
|
if observed:
|
|
expected = {'a': Index([0, 2], dtype='int64'),
|
|
'c': Index([1], dtype='int64')}
|
|
else:
|
|
expected = {'a': Index([0, 2], dtype='int64'),
|
|
'b': Index([], dtype='int64'),
|
|
'c': Index([1], dtype='int64')}
|
|
|
|
tm.assert_dict_equal(result, expected)
|
|
|
|
|
|
def test_datetime():
|
|
# GH9049: ensure backward compatibility
|
|
levels = pd.date_range('2014-01-01', periods=4)
|
|
codes = np.random.randint(0, 4, size=100)
|
|
|
|
cats = Categorical.from_codes(codes, levels, ordered=True)
|
|
|
|
data = DataFrame(np.random.randn(100, 4))
|
|
result = data.groupby(cats, observed=False).mean()
|
|
|
|
expected = data.groupby(np.asarray(cats), observed=False).mean()
|
|
expected = expected.reindex(levels)
|
|
expected.index = CategoricalIndex(expected.index,
|
|
categories=expected.index,
|
|
ordered=True)
|
|
|
|
assert_frame_equal(result, expected)
|
|
|
|
grouped = data.groupby(cats, observed=False)
|
|
desc_result = grouped.describe()
|
|
|
|
idx = cats.codes.argsort()
|
|
ord_labels = cats.take_nd(idx)
|
|
ord_data = data.take(idx)
|
|
expected = ord_data.groupby(ord_labels, observed=False).describe()
|
|
assert_frame_equal(desc_result, expected)
|
|
tm.assert_index_equal(desc_result.index, expected.index)
|
|
tm.assert_index_equal(
|
|
desc_result.index.get_level_values(0),
|
|
expected.index.get_level_values(0))
|
|
|
|
# GH 10460
|
|
expc = Categorical.from_codes(
|
|
np.arange(4).repeat(8), levels, ordered=True)
|
|
exp = CategoricalIndex(expc)
|
|
tm.assert_index_equal((desc_result.stack().index
|
|
.get_level_values(0)), exp)
|
|
exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
|
|
'75%', 'max'] * 4)
|
|
tm.assert_index_equal((desc_result.stack().index
|
|
.get_level_values(1)), exp)
|
|
|
|
|
|
def test_categorical_index():
|
|
|
|
s = np.random.RandomState(12345)
|
|
levels = ['foo', 'bar', 'baz', 'qux']
|
|
codes = s.randint(0, 4, size=20)
|
|
cats = Categorical.from_codes(codes, levels, ordered=True)
|
|
df = DataFrame(
|
|
np.repeat(
|
|
np.arange(20), 4).reshape(-1, 4), columns=list('abcd'))
|
|
df['cats'] = cats
|
|
|
|
# with a cat index
|
|
result = df.set_index('cats').groupby(level=0, observed=False).sum()
|
|
expected = df[list('abcd')].groupby(cats.codes, observed=False).sum()
|
|
expected.index = CategoricalIndex(
|
|
Categorical.from_codes(
|
|
[0, 1, 2, 3], levels, ordered=True), name='cats')
|
|
assert_frame_equal(result, expected)
|
|
|
|
# with a cat column, should produce a cat index
|
|
result = df.groupby('cats', observed=False).sum()
|
|
expected = df[list('abcd')].groupby(cats.codes, observed=False).sum()
|
|
expected.index = CategoricalIndex(
|
|
Categorical.from_codes(
|
|
[0, 1, 2, 3], levels, ordered=True), name='cats')
|
|
assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_describe_categorical_columns():
|
|
# GH 11558
|
|
cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'],
|
|
categories=['foo', 'bar', 'baz', 'qux'],
|
|
ordered=True)
|
|
df = DataFrame(np.random.randn(20, 4), columns=cats)
|
|
result = df.groupby([1, 2, 3, 4] * 5).describe()
|
|
|
|
tm.assert_index_equal(result.stack().columns, cats)
|
|
tm.assert_categorical_equal(result.stack().columns.values, cats.values)
|
|
|
|
|
|
def test_unstack_categorical():
|
|
# GH11558 (example is taken from the original issue)
|
|
df = pd.DataFrame({'a': range(10),
|
|
'medium': ['A', 'B'] * 5,
|
|
'artist': list('XYXXY') * 2})
|
|
df['medium'] = df['medium'].astype('category')
|
|
|
|
gcat = df.groupby(
|
|
['artist', 'medium'], observed=False)['a'].count().unstack()
|
|
result = gcat.describe()
|
|
|
|
exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False,
|
|
name='medium')
|
|
tm.assert_index_equal(result.columns, exp_columns)
|
|
tm.assert_categorical_equal(result.columns.values, exp_columns.values)
|
|
|
|
result = gcat['A'] + gcat['B']
|
|
expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist'))
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_bins_unequal_len():
|
|
# GH3011
|
|
series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
|
|
bins = pd.cut(series.dropna().values, 4)
|
|
|
|
# len(bins) != len(series) here
|
|
def f():
|
|
series.groupby(bins).mean()
|
|
pytest.raises(ValueError, f)
|
|
|
|
|
|
def test_as_index():
|
|
# GH13204
|
|
df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
|
|
'A': [10, 11, 11],
|
|
'B': [101, 102, 103]})
|
|
result = df.groupby(['cat', 'A'], as_index=False, observed=True).sum()
|
|
expected = DataFrame(
|
|
{'cat': Categorical([1, 2], categories=df.cat.cat.categories),
|
|
'A': [10, 11],
|
|
'B': [101, 205]},
|
|
columns=['cat', 'A', 'B'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# function grouper
|
|
f = lambda r: df.loc[r, 'A']
|
|
result = df.groupby(['cat', f], as_index=False, observed=True).sum()
|
|
expected = DataFrame(
|
|
{'cat': Categorical([1, 2], categories=df.cat.cat.categories),
|
|
'A': [10, 22],
|
|
'B': [101, 205]},
|
|
columns=['cat', 'A', 'B'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# another not in-axis grouper (conflicting names in index)
|
|
s = Series(['a', 'b', 'b'], name='cat')
|
|
result = df.groupby(['cat', s], as_index=False, observed=True).sum()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# is original index dropped?
|
|
group_columns = ['cat', 'A']
|
|
expected = DataFrame(
|
|
{'cat': Categorical([1, 2], categories=df.cat.cat.categories),
|
|
'A': [10, 11],
|
|
'B': [101, 205]},
|
|
columns=['cat', 'A', 'B'])
|
|
|
|
for name in [None, 'X', 'B', 'cat']:
|
|
df.index = Index(list("abc"), name=name)
|
|
|
|
if name in group_columns and name in df.index.names:
|
|
with tm.assert_produces_warning(FutureWarning,
|
|
check_stacklevel=False):
|
|
result = df.groupby(
|
|
group_columns, as_index=False, observed=True).sum()
|
|
|
|
else:
|
|
result = df.groupby(
|
|
group_columns, as_index=False, observed=True).sum()
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_preserve_categories():
|
|
# GH-13179
|
|
categories = list('abc')
|
|
|
|
# ordered=True
|
|
df = DataFrame({'A': pd.Categorical(list('ba'),
|
|
categories=categories,
|
|
ordered=True)})
|
|
index = pd.CategoricalIndex(categories, categories, ordered=True)
|
|
tm.assert_index_equal(
|
|
df.groupby('A', sort=True, observed=False).first().index, index)
|
|
tm.assert_index_equal(
|
|
df.groupby('A', sort=False, observed=False).first().index, index)
|
|
|
|
# ordered=False
|
|
df = DataFrame({'A': pd.Categorical(list('ba'),
|
|
categories=categories,
|
|
ordered=False)})
|
|
sort_index = pd.CategoricalIndex(categories, categories, ordered=False)
|
|
nosort_index = pd.CategoricalIndex(list('bac'), list('bac'),
|
|
ordered=False)
|
|
tm.assert_index_equal(
|
|
df.groupby('A', sort=True, observed=False).first().index,
|
|
sort_index)
|
|
tm.assert_index_equal(
|
|
df.groupby('A', sort=False, observed=False).first().index,
|
|
nosort_index)
|
|
|
|
|
|
def test_preserve_categorical_dtype():
|
|
# GH13743, GH13854
|
|
df = DataFrame({'A': [1, 2, 1, 1, 2],
|
|
'B': [10, 16, 22, 28, 34],
|
|
'C1': Categorical(list("abaab"),
|
|
categories=list("bac"),
|
|
ordered=False),
|
|
'C2': Categorical(list("abaab"),
|
|
categories=list("bac"),
|
|
ordered=True)})
|
|
# single grouper
|
|
exp_full = DataFrame({'A': [2.0, 1.0, np.nan],
|
|
'B': [25.0, 20.0, np.nan],
|
|
'C1': Categorical(list("bac"),
|
|
categories=list("bac"),
|
|
ordered=False),
|
|
'C2': Categorical(list("bac"),
|
|
categories=list("bac"),
|
|
ordered=True)})
|
|
for col in ['C1', 'C2']:
|
|
result1 = df.groupby(by=col, as_index=False, observed=False).mean()
|
|
result2 = df.groupby(
|
|
by=col, as_index=True, observed=False).mean().reset_index()
|
|
expected = exp_full.reindex(columns=result1.columns)
|
|
tm.assert_frame_equal(result1, expected)
|
|
tm.assert_frame_equal(result2, expected)
|
|
|
|
|
|
def test_categorical_no_compress():
|
|
data = Series(np.random.randn(9))
|
|
|
|
codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
|
|
cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)
|
|
|
|
result = data.groupby(cats, observed=False).mean()
|
|
exp = data.groupby(codes, observed=False).mean()
|
|
|
|
exp.index = CategoricalIndex(exp.index, categories=cats.categories,
|
|
ordered=cats.ordered)
|
|
assert_series_equal(result, exp)
|
|
|
|
codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
|
|
cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)
|
|
|
|
result = data.groupby(cats, observed=False).mean()
|
|
exp = data.groupby(codes, observed=False).mean().reindex(cats.categories)
|
|
exp.index = CategoricalIndex(exp.index, categories=cats.categories,
|
|
ordered=cats.ordered)
|
|
assert_series_equal(result, exp)
|
|
|
|
cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
|
|
categories=["a", "b", "c", "d"], ordered=True)
|
|
data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
|
|
|
|
result = data.groupby("b", observed=False).mean()
|
|
result = result["a"].values
|
|
exp = np.array([1, 2, 4, np.nan])
|
|
tm.assert_numpy_array_equal(result, exp)
|
|
|
|
|
|
def test_sort():
|
|
|
|
# http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby # noqa: flake8
|
|
# This should result in a properly sorted Series so that the plot
|
|
# has a sorted x axis
|
|
# self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
|
|
|
|
df = DataFrame({'value': np.random.randint(0, 10000, 100)})
|
|
labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
|
|
cat_labels = Categorical(labels, labels)
|
|
|
|
df = df.sort_values(by=['value'], ascending=True)
|
|
df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
|
|
right=False, labels=cat_labels)
|
|
|
|
res = df.groupby(['value_group'], observed=False)['value_group'].count()
|
|
exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))]
|
|
exp.index = CategoricalIndex(exp.index, name=exp.index.name)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
|
|
def test_sort2():
|
|
# dataframe groupby sort was being ignored # GH 8868
|
|
df = DataFrame([['(7.5, 10]', 10, 10],
|
|
['(7.5, 10]', 8, 20],
|
|
['(2.5, 5]', 5, 30],
|
|
['(5, 7.5]', 6, 40],
|
|
['(2.5, 5]', 4, 50],
|
|
['(0, 2.5]', 1, 60],
|
|
['(5, 7.5]', 7, 70]], columns=['range', 'foo', 'bar'])
|
|
df['range'] = Categorical(df['range'], ordered=True)
|
|
index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
|
|
'(7.5, 10]'], name='range', ordered=True)
|
|
expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
|
|
columns=['foo', 'bar'], index=index)
|
|
|
|
col = 'range'
|
|
result_sort = df.groupby(col, sort=True, observed=False).first()
|
|
assert_frame_equal(result_sort, expected_sort)
|
|
|
|
# when categories is ordered, group is ordered by category's order
|
|
expected_sort = result_sort
|
|
result_sort = df.groupby(col, sort=False, observed=False).first()
|
|
assert_frame_equal(result_sort, expected_sort)
|
|
|
|
df['range'] = Categorical(df['range'], ordered=False)
|
|
index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
|
|
'(7.5, 10]'], name='range')
|
|
expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
|
|
columns=['foo', 'bar'], index=index)
|
|
|
|
index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]',
|
|
'(0, 2.5]'],
|
|
categories=['(7.5, 10]', '(2.5, 5]',
|
|
'(5, 7.5]', '(0, 2.5]'],
|
|
name='range')
|
|
expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
|
|
index=index, columns=['foo', 'bar'])
|
|
|
|
col = 'range'
|
|
|
|
# this is an unordered categorical, but we allow this ####
|
|
result_sort = df.groupby(col, sort=True, observed=False).first()
|
|
assert_frame_equal(result_sort, expected_sort)
|
|
|
|
result_nosort = df.groupby(col, sort=False, observed=False).first()
|
|
assert_frame_equal(result_nosort, expected_nosort)
|
|
|
|
|
|
def test_sort_datetimelike():
|
|
# GH10505
|
|
|
|
# use same data as test_groupby_sort_categorical, which category is
|
|
# corresponding to datetime.month
|
|
df = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
|
|
datetime(2011, 2, 1), datetime(2011, 5, 1),
|
|
datetime(2011, 2, 1), datetime(2011, 1, 1),
|
|
datetime(2011, 5, 1)],
|
|
'foo': [10, 8, 5, 6, 4, 1, 7],
|
|
'bar': [10, 20, 30, 40, 50, 60, 70]},
|
|
columns=['dt', 'foo', 'bar'])
|
|
|
|
# ordered=True
|
|
df['dt'] = Categorical(df['dt'], ordered=True)
|
|
index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
|
|
datetime(2011, 5, 1), datetime(2011, 7, 1)]
|
|
result_sort = DataFrame(
|
|
[[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
|
|
result_sort.index = CategoricalIndex(index, name='dt', ordered=True)
|
|
|
|
index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
|
|
datetime(2011, 5, 1), datetime(2011, 1, 1)]
|
|
result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
|
|
columns=['foo', 'bar'])
|
|
result_nosort.index = CategoricalIndex(index, categories=index,
|
|
name='dt', ordered=True)
|
|
|
|
col = 'dt'
|
|
assert_frame_equal(
|
|
result_sort, df.groupby(col, sort=True, observed=False).first())
|
|
|
|
# when categories is ordered, group is ordered by category's order
|
|
assert_frame_equal(
|
|
result_sort, df.groupby(col, sort=False, observed=False).first())
|
|
|
|
# ordered = False
|
|
df['dt'] = Categorical(df['dt'], ordered=False)
|
|
index = [datetime(2011, 1, 1), datetime(2011, 2, 1),
|
|
datetime(2011, 5, 1), datetime(2011, 7, 1)]
|
|
result_sort = DataFrame(
|
|
[[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
|
|
result_sort.index = CategoricalIndex(index, name='dt')
|
|
|
|
index = [datetime(2011, 7, 1), datetime(2011, 2, 1),
|
|
datetime(2011, 5, 1), datetime(2011, 1, 1)]
|
|
result_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
|
|
columns=['foo', 'bar'])
|
|
result_nosort.index = CategoricalIndex(index, categories=index,
|
|
name='dt')
|
|
|
|
col = 'dt'
|
|
assert_frame_equal(
|
|
result_sort, df.groupby(col, sort=True, observed=False).first())
|
|
assert_frame_equal(
|
|
result_nosort, df.groupby(col, sort=False, observed=False).first())
|
|
|
|
|
|
def test_empty_sum():
|
|
# https://github.com/pandas-dev/pandas/issues/18678
|
|
df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
|
|
categories=['a', 'b', 'c']),
|
|
'B': [1, 2, 1]})
|
|
expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
|
|
|
|
# 0 by default
|
|
result = df.groupby("A", observed=False).B.sum()
|
|
expected = pd.Series([3, 1, 0], expected_idx, name='B')
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count=0
|
|
result = df.groupby("A", observed=False).B.sum(min_count=0)
|
|
expected = pd.Series([3, 1, 0], expected_idx, name='B')
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count=1
|
|
result = df.groupby("A", observed=False).B.sum(min_count=1)
|
|
expected = pd.Series([3, 1, np.nan], expected_idx, name='B')
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count>1
|
|
result = df.groupby("A", observed=False).B.sum(min_count=2)
|
|
expected = pd.Series([3, np.nan, np.nan], expected_idx, name='B')
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_empty_prod():
|
|
# https://github.com/pandas-dev/pandas/issues/18678
|
|
df = pd.DataFrame({"A": pd.Categorical(['a', 'a', 'b'],
|
|
categories=['a', 'b', 'c']),
|
|
'B': [1, 2, 1]})
|
|
|
|
expected_idx = pd.CategoricalIndex(['a', 'b', 'c'], name='A')
|
|
|
|
# 1 by default
|
|
result = df.groupby("A", observed=False).B.prod()
|
|
expected = pd.Series([2, 1, 1], expected_idx, name='B')
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count=0
|
|
result = df.groupby("A", observed=False).B.prod(min_count=0)
|
|
expected = pd.Series([2, 1, 1], expected_idx, name='B')
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count=1
|
|
result = df.groupby("A", observed=False).B.prod(min_count=1)
|
|
expected = pd.Series([2, 1, np.nan], expected_idx, name='B')
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_groupby_multiindex_categorical_datetime():
|
|
# https://github.com/pandas-dev/pandas/issues/21390
|
|
|
|
df = pd.DataFrame({
|
|
'key1': pd.Categorical(list('abcbabcba')),
|
|
'key2': pd.Categorical(
|
|
list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3),
|
|
'values': np.arange(9),
|
|
})
|
|
result = df.groupby(['key1', 'key2']).mean()
|
|
|
|
idx = pd.MultiIndex.from_product(
|
|
[pd.Categorical(['a', 'b', 'c']),
|
|
pd.Categorical(pd.date_range('2018-06-01 00', freq='1T', periods=3))],
|
|
names=['key1', 'key2'])
|
|
expected = pd.DataFrame(
|
|
{'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
|
|
assert_frame_equal(result, expected)
|