laywerrobot/lib/python3.6/site-packages/pandas/tests/groupby/test_categorical.py
2020-08-27 21:55:39 +02:00

872 lines
32 KiB
Python

# -*- coding: utf-8 -*-
from __future__ import print_function
from datetime import datetime
import pytest
import numpy as np
import pandas as pd
from pandas.compat import PY37
from pandas import (Index, MultiIndex, CategoricalIndex,
DataFrame, Categorical, Series, qcut)
from pandas.util.testing import assert_frame_equal, assert_series_equal
import pandas.util.testing as tm
def cartesian_product_for_groupers(result, args, names):
    """Reindex ``result`` onto the cartesian product of the groupers.

    Each categorical grouper (``Categorical`` or ``CategoricalIndex``) is
    replaced by a categorical holding every one of its categories exactly
    once, keeping its ``ordered`` flag, so categories that never occur in
    the data still take part in the product.  Any other grouper is used as
    given.  The product becomes a ``MultiIndex`` labelled with ``names``;
    ``result`` is reindexed onto it and returned sorted by index.
    """

    def expand(grouper):
        # Non-categorical groupers go into the product untouched.
        if not isinstance(grouper, (CategoricalIndex, Categorical)):
            return grouper
        cats = grouper.categories
        return Categorical.from_codes(np.arange(len(cats)),
                                      categories=cats,
                                      ordered=grouper.ordered)

    levels = [expand(arg) for arg in args]
    full_index = pd.MultiIndex.from_product(levels, names=names)
    return result.reindex(full_index).sort_index()
def test_apply_use_categorical_name(df):
    """The first index level of an apply over ``qcut(df.C, 4)`` is named 'C'."""
    quartiles = qcut(df.C, 4)

    def get_stats(group):
        # Return the summary as a dict, one entry per statistic.
        stats = {}
        stats['min'] = group.min()
        stats['max'] = group.max()
        stats['count'] = group.count()
        stats['mean'] = group.mean()
        return stats

    grouped = df.groupby(quartiles, observed=False)
    result = grouped.D.apply(get_stats)
    assert result.index.names[0] == 'C'
def test_basic():
    """Exercise basic groupby operations with categorical groupers.

    Sections, in order: mean/sum with declared-but-unused categories,
    transform/apply with a categorical column (GH 8623), transform and
    filter over ``pd.cut`` bins (GH 9921), ``apply(len)`` over labelled
    bins (GH 9603), and mean/describe over random ordered codes
    (GH 10460).
    """
    grouper = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                          categories=["a", "b", "c", "d"], ordered=True)
    frame = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": grouper})

    # category 'd' has no rows, so its mean is expected to be NaN
    want_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    want = DataFrame({'a': [1, 2, 4, np.nan]}, index=want_index)
    got = frame.groupby("b", observed=False).mean()
    tm.assert_frame_equal(got, want)

    first = Categorical(["a", "a", "b", "b"],
                        categories=["a", "b", "z"], ordered=True)
    second = Categorical(["c", "d", "c", "d"],
                         categories=["c", "d", "y"], ordered=True)
    frame = DataFrame({"A": first, "B": second, "values": [1, 2, 3, 4]})

    # single grouper
    by_a = frame.groupby("A", observed=False)
    want_index = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    want = DataFrame({'values': Series([3, 7, 0], index=want_index)})
    tm.assert_frame_equal(by_a.sum(), want)

    # GH 8623
    people = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                        [1, 'John P. Doe']],
                       columns=['person_id', 'person_name'])
    people['person_name'] = Categorical(people.person_name)

    by_id = people.groupby(['person_id'], observed=False)
    got = by_id.transform(lambda x: x)
    tm.assert_frame_equal(got, people[['person_name']])

    got = people.drop_duplicates('person_name')
    want = people.iloc[[0, 1]]
    tm.assert_frame_equal(got, want)

    def f(x):
        return x.drop_duplicates('person_name').iloc[0]

    got = by_id.apply(f)
    want = people.iloc[[0, 1]].copy()
    want.index = Index([1, 2], name='person_id')
    want['person_name'] = want['person_name'].astype('object')
    tm.assert_frame_equal(got, want)

    # GH 9921
    # Monotonic
    frame = DataFrame({"a": [5, 15, 25]})
    bins = pd.cut(frame.a, bins=[0, 10, 20, 30, 40])

    got = frame.a.groupby(bins, observed=False).transform(sum)
    tm.assert_series_equal(got, frame['a'])

    got = frame.a.groupby(bins, observed=False).transform(
        lambda xs: np.sum(xs))
    tm.assert_series_equal(got, frame['a'])

    got = frame.groupby(bins, observed=False).transform(sum)
    tm.assert_frame_equal(got, frame[['a']])

    got = frame.groupby(bins, observed=False).transform(
        lambda xs: np.max(xs))
    tm.assert_frame_equal(got, frame[['a']])

    # Filter
    got = frame.a.groupby(bins, observed=False).filter(np.all)
    tm.assert_series_equal(got, frame['a'])

    got = frame.groupby(bins, observed=False).filter(np.all)
    tm.assert_frame_equal(got, frame)

    # Non-monotonic
    frame = DataFrame({"a": [5, 15, 25, -5]})
    bins = pd.cut(frame.a, bins=[-10, 0, 10, 20, 30, 40])

    got = frame.a.groupby(bins, observed=False).transform(sum)
    tm.assert_series_equal(got, frame['a'])

    got = frame.a.groupby(bins, observed=False).transform(
        lambda xs: np.sum(xs))
    tm.assert_series_equal(got, frame['a'])

    got = frame.groupby(bins, observed=False).transform(sum)
    tm.assert_frame_equal(got, frame[['a']])

    got = frame.groupby(bins, observed=False).transform(
        lambda xs: np.sum(xs))
    tm.assert_frame_equal(got, frame[['a']])

    # GH 9603
    frame = DataFrame({'a': [1, 0, 0, 0]})
    bins = pd.cut(frame.a, [0, 1, 2, 3, 4],
                  labels=Categorical(list('abcd')))
    got = frame.groupby(bins, observed=False).apply(len)

    binned = bins.values
    want_index = CategoricalIndex(binned.categories, ordered=binned.ordered)
    want = Series([1, 0, 0, 0], index=want_index)
    want.index.name = 'a'
    tm.assert_series_equal(got, want)

    # more basic
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)
    cats = Categorical.from_codes(codes, levels, ordered=True)
    data = DataFrame(np.random.randn(100, 4))

    got = data.groupby(cats, observed=False).mean()
    want = data.groupby(np.asarray(cats), observed=False).mean()
    want = want.reindex(
        CategoricalIndex(levels, categories=cats.categories, ordered=True))
    assert_frame_equal(got, want)

    described = data.groupby(cats, observed=False).describe()

    # rebuild the expectation from the rows ordered by category code
    order = cats.codes.argsort()
    ordered_labels = np.asarray(cats).take(order)
    ordered_data = data.take(order)
    ordered_cats = Categorical(ordered_labels, ordered=True,
                               categories=['foo', 'bar', 'baz', 'qux'])
    want = ordered_data.groupby(
        ordered_cats, sort=False, observed=False).describe()
    assert_frame_equal(described, want)

    # GH 10460
    want_cats = Categorical.from_codes(np.arange(4).repeat(8),
                                       levels, ordered=True)
    want_level = CategoricalIndex(want_cats)
    tm.assert_index_equal(
        described.stack().index.get_level_values(0), want_level)

    want_level = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                        '75%', 'max'] * 4)
    tm.assert_index_equal(
        described.stack().index.get_level_values(1), want_level)
def test_level_get_group(observed):
    """``get_group('a')`` on a categorical index level (GH15155, GH15166)."""
    full_index = MultiIndex(
        levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
        labels=[[0] * 5 + [1] * 5, range(10)],
        names=["Index1", "Index2"])
    frame = DataFrame(data=np.arange(2, 22, 2), index=full_index)
    grouped = frame.groupby(level=["Index1"], observed=observed)

    # Expected frame: the first five rows, whose first-level code is 0
    # ("a"); the first level keeps both categories.
    group_index = pd.MultiIndex(
        levels=[pd.CategoricalIndex(["a", "b"]), range(5)],
        labels=[[0] * 5, range(5)],
        names=["Index1", "Index2"])
    expected = DataFrame(data=np.arange(2, 12, 2), index=group_index)

    result = grouped.get_group('a')
    assert_frame_equal(result, expected)
@pytest.mark.xfail(PY37, reason="flaky on 3.7, xref gh-21636")
@pytest.mark.parametrize('ordered', [True, False])
def test_apply(ordered):
    """apply/mean/agg over two categorical keys with ``observed=True``.

    Regression test for GH 10138, run for ordered and unordered
    categoricals.
    """
    dense = Categorical(list('abc'), ordered=ordered)

    # category 'b' is declared here but no row uses it
    missing = Categorical(
        list('aaa'), categories=['a', 'b'], ordered=ordered)

    frame = DataFrame({'missing': missing,
                       'dense': dense,
                       'values': np.arange(len(dense))})
    grouped = frame.groupby(['missing', 'dense'], observed=True)

    # The expected index is built from the categoricals themselves, so
    # its 'missing' level is derived from a categorical declaring 'b'.
    frame_index = MultiIndex.from_arrays(
        [missing, dense], names=['missing', 'dense'])
    expected = DataFrame([0, 1, 2.],
                         index=frame_index,
                         columns=['values'])

    result = grouped.apply(lambda x: np.mean(x))
    assert_frame_equal(result, expected)

    # mean and agg are compared against the integer version of expected
    expected = expected.astype('int')
    assert_frame_equal(grouped.mean(), expected)
    assert_frame_equal(grouped.agg(np.mean), expected)

    # applying a constant is expected to give a Series on the same index
    series_index = MultiIndex.from_arrays([missing, dense],
                                          names=['missing', 'dense'])
    expected = Series(1, index=series_index)
    result = grouped.apply(lambda x: 1)
    assert_series_equal(result, expected)
def test_observed(observed):
    """Check groupby results for both settings of ``observed``.

    When ``observed`` is false the expected frames are expanded with
    ``cartesian_product_for_groupers`` (or a reindex for the single-key
    case); otherwise only the combinations present in the data are
    expected.  Issues referenced: gh-14942 (implement), gh-10132 and
    gh-8138 (back-compat), gh-8869.
    """
    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    frame = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    frame['C'] = ['foo', 'bar'] * 2

    # two categorical groupers plus a plain string column
    grouped = frame.groupby(['A', 'B', 'C'], observed=observed)
    seen_index = pd.MultiIndex.from_arrays(
        [cat1, cat2, ['foo', 'bar'] * 2],
        names=['A', 'B', 'C'])
    values = Series([1, 2, 3, 4], index=seen_index)
    expected = DataFrame({'values': values}).sort_index()
    result = grouped.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [cat1, cat2, ['foo', 'bar']],
            list('ABC'))
    tm.assert_frame_equal(result, expected)

    # the two categorical groupers on their own
    grouped = frame.groupby(['A', 'B'], observed=observed)
    seen_index = pd.MultiIndex.from_arrays(
        [cat1, cat2],
        names=['A', 'B'])
    expected = DataFrame({'values': [1, 2, 3, 4]},
                         index=seen_index)
    result = grouped.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [cat1, cat2],
            list('AB'))
    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    frame = pd.DataFrame(
        {'cat': pd.Categorical(["a", "b", "a", "b"],
                               categories=["a", "b", "c"],
                               ordered=True),
         'ints': [1, 1, 2, 2],
         'val': [10, 20, 30, 40]})

    # Grouping on a single column
    by_cat = frame.groupby("cat", observed=observed)
    result = by_cat.mean()

    seen_index = pd.CategoricalIndex(list('ab'), name="cat",
                                     categories=list('abc'),
                                     ordered=True)
    expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]},
                         index=seen_index)
    if not observed:
        every_cat = pd.CategoricalIndex(list('abc'), name="cat",
                                        categories=list('abc'),
                                        ordered=True)
        expected = expected.reindex(every_cat)
    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = frame.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg('mean')
    flat = DataFrame(
        {"val": [10, 30, 20, 40],
         "cat": pd.Categorical(['a', 'a', 'b', 'b'],
                               categories=['a', 'b', 'c'],
                               ordered=True),
         "ints": [1, 2, 1, 2]})
    expected = flat.set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [frame.cat.values, [1, 2]],
            ['cat', 'ints'])
    tm.assert_frame_equal(result, expected)

    # GH 10132
    for cat_label, int_label in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
        result = groups_double_key.get_group((cat_label, int_label))
        mask = (frame.cat == cat_label) & (frame.ints == int_label)
        assert_frame_equal(result, frame[mask])

    # gh-8869
    # with as_index
    frame = pd.DataFrame(
        {'foo': [10, 8, 4, 8, 4, 1, 1],
         'bar': [10, 20, 30, 40, 50, 60, 70],
         'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']})
    frame['range'] = pd.cut(frame['foo'], np.linspace(0, 10, 3))

    keys_as_columns = frame.groupby(
        ['range', 'baz'], as_index=False, observed=observed)
    result = keys_as_columns.agg('mean')

    keys_as_index = frame.groupby(
        ['range', 'baz'], as_index=True, observed=observed)
    expected = keys_as_index.agg('mean').reset_index()
    tm.assert_frame_equal(result, expected)
def test_observed_codes_remap(observed):
    """Mean over a ``pd.cut`` grouper plus a plain column.

    The bins are ``[1, 2, 3, 6]`` while C1 only holds 3, 3, 4 and 5.
    """
    frame = pd.DataFrame({'C1': [3, 3, 4, 5],
                          'C2': [1, 2, 3, 4],
                          'C3': [10, 100, 200, 34]})
    binned = pd.cut(frame['C1'], [1, 2, 3, 6])
    binned.name = "cat"
    grouped = frame.groupby([binned, 'C2'], observed=observed)

    seen_index = MultiIndex.from_arrays([binned, [1, 2, 3, 4]],
                                        names=["cat", "C2"])
    expected = DataFrame({"C1": [3, 3, 4, 5],
                          "C3": [10, 100, 200, 34]}, index=seen_index)
    if not observed:
        # expand to every (bin, C2) pair
        expected = cartesian_product_for_groupers(
            expected,
            [binned.values, [1, 2, 3, 4]],
            ['cat', 'C2'])

    result = grouped.agg('mean')
    tm.assert_frame_equal(result, expected)
def test_observed_perf():
    """Group 30000 random rows by three keys with ``observed=True``.

    Without ``observed`` a cartesian product of the keys would be built,
    which is non-performant (gh-14942).  Each level of the result index
    must hold exactly as many unique values as its source column.
    """
    n_rows = 30000
    frame = DataFrame({
        'cat': np.random.randint(0, 255, size=n_rows),
        'int_id': np.random.randint(0, 255, size=n_rows),
        'other_id': np.random.randint(0, 10000, size=n_rows),
        'foo': 0})
    frame['cat'] = frame.cat.astype(str).astype('category')

    keys = ['cat', 'int_id', 'other_id']
    result = frame.groupby(keys, observed=True).count()

    levels = result.index.levels
    assert levels[0].nunique() == frame.cat.nunique()
    assert levels[1].nunique() == frame.int_id.nunique()
    assert levels[2].nunique() == frame.other_id.nunique()
def test_observed_groups(observed):
    """``.groups`` holds the appropriate groups for ``observed`` (gh-20583).

    The unused category 'b' is expected as an empty group only when
    ``observed`` is false.
    """
    cat = pd.Categorical(['a', 'c', 'a'], categories=['a', 'b', 'c'])
    frame = pd.DataFrame({'cat': cat, 'vals': [1, 2, 3]})
    result = frame.groupby('cat', observed=observed).groups

    if not observed:
        expected = {'a': Index([0, 2], dtype='int64'),
                    'b': Index([], dtype='int64'),
                    'c': Index([1], dtype='int64')}
    else:
        expected = {'a': Index([0, 2], dtype='int64'),
                    'c': Index([1], dtype='int64')}

    tm.assert_dict_equal(result, expected)
def test_datetime():
    """Mean and describe over a categorical with datetime categories.

    GH9049 (backward compatibility) and GH 10460.
    """
    levels = pd.date_range('2014-01-01', periods=4)
    codes = np.random.randint(0, 4, size=100)
    cats = Categorical.from_codes(codes, levels, ordered=True)
    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    # same computation on the plain values, re-labelled afterwards
    expected = data.groupby(np.asarray(cats), observed=False).mean()
    expected = expected.reindex(levels)
    expected.index = CategoricalIndex(expected.index,
                                      categories=expected.index,
                                      ordered=True)
    assert_frame_equal(result, expected)

    desc_result = data.groupby(cats, observed=False).describe()

    # describe again after ordering rows and labels by category code
    order = cats.codes.argsort()
    ordered_labels = cats.take_nd(order)
    ordered_data = data.take(order)
    expected = ordered_data.groupby(
        ordered_labels, observed=False).describe()
    assert_frame_equal(desc_result, expected)
    tm.assert_index_equal(desc_result.index, expected.index)
    tm.assert_index_equal(
        desc_result.index.get_level_values(0),
        expected.index.get_level_values(0))

    # GH 10460
    want_cats = Categorical.from_codes(
        np.arange(4).repeat(8), levels, ordered=True)
    want_level = CategoricalIndex(want_cats)
    tm.assert_index_equal(
        desc_result.stack().index.get_level_values(0), want_level)

    want_level = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                        '75%', 'max'] * 4)
    tm.assert_index_equal(
        desc_result.stack().index.get_level_values(1), want_level)
def test_categorical_index():
    """Summing by a categorical index level or column gives a cat index.

    Both results are compared with a sum grouped by the raw codes whose
    index is replaced by an ordered ``CategoricalIndex`` named 'cats'.
    """
    rng = np.random.RandomState(12345)
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = rng.randint(0, 4, size=20)
    cats = Categorical.from_codes(codes, levels, ordered=True)

    values = np.repeat(np.arange(20), 4).reshape(-1, 4)
    frame = DataFrame(values, columns=list('abcd'))
    frame['cats'] = cats

    # with a cat index
    result = frame.set_index('cats').groupby(level=0, observed=False).sum()
    by_codes = frame[list('abcd')].groupby(cats.codes, observed=False)
    expected = by_codes.sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True),
        name='cats')
    assert_frame_equal(result, expected)

    # with a cat column, should produce a cat index
    result = frame.groupby('cats', observed=False).sum()
    by_codes = frame[list('abcd')].groupby(cats.codes, observed=False)
    expected = by_codes.sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True),
        name='cats')
    assert_frame_equal(result, expected)
def test_describe_categorical_columns():
    """Stacking a grouped ``describe`` gives back the categorical columns.

    GH 11558.
    """
    columns = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'],
                                  categories=['foo', 'bar', 'baz', 'qux'],
                                  ordered=True)
    frame = DataFrame(np.random.randn(20, 4), columns=columns)
    described = frame.groupby([1, 2, 3, 4] * 5).describe()

    tm.assert_index_equal(described.stack().columns, columns)
    tm.assert_categorical_equal(described.stack().columns.values,
                                columns.values)
def test_unstack_categorical():
    """Unstacking a categorical key gives categorical columns.

    GH11558; the example is taken from the original issue.
    """
    frame = pd.DataFrame({'a': range(10),
                          'medium': ['A', 'B'] * 5,
                          'artist': list('XYXXY') * 2})
    frame['medium'] = frame['medium'].astype('category')

    counts = frame.groupby(['artist', 'medium'], observed=False)['a'].count()
    gcat = counts.unstack()

    described = gcat.describe()
    want_columns = pd.CategoricalIndex(['A', 'B'], ordered=False,
                                       name='medium')
    tm.assert_index_equal(described.columns, want_columns)
    tm.assert_categorical_equal(described.columns.values,
                                want_columns.values)

    # columns selected by category label can be added together
    total = gcat['A'] + gcat['B']
    want_total = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist'))
    tm.assert_series_equal(total, want_total)
def test_bins_unequal_len():
    """Grouping by bins of a different length raises ``ValueError`` (GH3011)."""
    series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])

    # the bins come from the NaN-free values, so len(bins) != len(series)
    bins = pd.cut(series.dropna().values, 4)

    with pytest.raises(ValueError):
        series.groupby(bins).mean()
def test_as_index():
    """``as_index=False`` with a categorical key and ``observed=True``.

    GH13204.  The second key is, in turn, a column, a function, and a
    Series named like an existing column; the last section repeats the
    column case under differently named indexes.
    """
    frame = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
                       'A': [10, 11, 11],
                       'B': [101, 102, 103]})
    out_columns = ['cat', 'A', 'B']

    result = frame.groupby(
        ['cat', 'A'], as_index=False, observed=True).sum()
    expected = DataFrame(
        {'cat': Categorical([1, 2], categories=frame.cat.cat.categories),
         'A': [10, 11],
         'B': [101, 205]},
        columns=out_columns)
    tm.assert_frame_equal(result, expected)

    # function grouper
    f = lambda r: frame.loc[r, 'A']
    result = frame.groupby(
        ['cat', f], as_index=False, observed=True).sum()
    expected = DataFrame(
        {'cat': Categorical([1, 2], categories=frame.cat.cat.categories),
         'A': [10, 22],
         'B': [101, 205]},
        columns=out_columns)
    tm.assert_frame_equal(result, expected)

    # another not in-axis grouper (conflicting names in index)
    s = Series(['a', 'b', 'b'], name='cat')
    result = frame.groupby(
        ['cat', s], as_index=False, observed=True).sum()
    tm.assert_frame_equal(result, expected)

    # is original index dropped?
    group_columns = ['cat', 'A']
    expected = DataFrame(
        {'cat': Categorical([1, 2], categories=frame.cat.cat.categories),
         'A': [10, 11],
         'B': [101, 205]},
        columns=out_columns)

    def grouped_sum():
        return frame.groupby(
            group_columns, as_index=False, observed=True).sum()

    for name in [None, 'X', 'B', 'cat']:
        frame.index = Index(list("abc"), name=name)

        # a FutureWarning is asserted only when the index name is also
        # one of the grouping columns
        clashes = name in group_columns and name in frame.index.names
        if clashes:
            with tm.assert_produces_warning(FutureWarning,
                                            check_stacklevel=False):
                result = grouped_sum()
        else:
            result = grouped_sum()

        tm.assert_frame_equal(result, expected)
def test_preserve_categories():
    """Index of ``first()`` for ordered and unordered keys (GH-13179).

    For the ordered key both ``sort`` settings are compared with the
    same index; for the unordered key ``sort=False`` is compared with an
    index in 'b', 'a', 'c' order.
    """
    categories = list('abc')

    # ordered=True
    ordered_key = pd.Categorical(list('ba'),
                                 categories=categories,
                                 ordered=True)
    frame = DataFrame({'A': ordered_key})
    want = pd.CategoricalIndex(categories, categories, ordered=True)

    got = frame.groupby('A', sort=True, observed=False).first().index
    tm.assert_index_equal(got, want)
    got = frame.groupby('A', sort=False, observed=False).first().index
    tm.assert_index_equal(got, want)

    # ordered=False
    unordered_key = pd.Categorical(list('ba'),
                                   categories=categories,
                                   ordered=False)
    frame = DataFrame({'A': unordered_key})
    want_sorted = pd.CategoricalIndex(categories, categories, ordered=False)
    want_unsorted = pd.CategoricalIndex(list('bac'), list('bac'),
                                        ordered=False)

    got = frame.groupby('A', sort=True, observed=False).first().index
    tm.assert_index_equal(got, want_sorted)
    got = frame.groupby('A', sort=False, observed=False).first().index
    tm.assert_index_equal(got, want_unsorted)
def test_preserve_categorical_dtype():
    """Grouping by a categorical column keeps its dtype in the output.

    GH13743, GH13854.  Checked for an unordered ('C1') and an ordered
    ('C2') key, with ``as_index=False`` and with ``as_index=True``
    followed by ``reset_index``.
    """
    labels = list("abaab")
    cat_order = list("bac")
    frame = DataFrame({'A': [1, 2, 1, 1, 2],
                       'B': [10, 16, 22, 28, 34],
                       'C1': Categorical(labels,
                                         categories=cat_order,
                                         ordered=False),
                       'C2': Categorical(labels,
                                         categories=cat_order,
                                         ordered=True)})

    # single grouper
    full_expected = DataFrame({'A': [2.0, 1.0, np.nan],
                              'B': [25.0, 20.0, np.nan],
                              'C1': Categorical(cat_order,
                                                categories=cat_order,
                                                ordered=False),
                              'C2': Categorical(cat_order,
                                                categories=cat_order,
                                                ordered=True)})

    for key in ['C1', 'C2']:
        as_columns = frame.groupby(
            by=key, as_index=False, observed=False).mean()
        as_index = frame.groupby(
            by=key, as_index=True, observed=False).mean()
        flattened = as_index.reset_index()

        expected = full_expected.reindex(columns=as_columns.columns)
        tm.assert_frame_equal(as_columns, expected)
        tm.assert_frame_equal(flattened, expected)
def test_categorical_no_compress():
    """Categories without rows still appear in the grouped mean.

    Three cases: every code used, code 2 unused, and a string
    categorical whose category 'd' is unused.
    """
    data = Series(np.random.randn(9))

    # every category has rows
    codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)

    result = data.groupby(cats, observed=False).mean()
    expected = data.groupby(codes, observed=False).mean()
    expected.index = CategoricalIndex(expected.index,
                                      categories=cats.categories,
                                      ordered=cats.ordered)
    assert_series_equal(result, expected)

    # category 2 has no rows
    codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
    cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)

    result = data.groupby(cats, observed=False).mean()
    by_codes = data.groupby(codes, observed=False).mean()
    expected = by_codes.reindex(cats.categories)
    expected.index = CategoricalIndex(expected.index,
                                      categories=cats.categories,
                                      ordered=cats.ordered)
    assert_series_equal(result, expected)

    # string categories, 'd' has no rows
    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"], ordered=True)
    frame = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    means = frame.groupby("b", observed=False).mean()
    tm.assert_numpy_array_equal(means["a"].values,
                                np.array([1, 2, 4, np.nan]))
def test_sort():
    """Counts grouped by labelled bins come back ordered by bin.

    The result must equal itself re-selected in ascending order of each
    label's leading number.
    """
    # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby # noqa: flake8
    # A properly sorted Series is needed so that a bar plot of the counts
    # has a sorted x axis, e.g.
    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
    frame = DataFrame({'value': np.random.randint(0, 10000, 100)})
    labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
    cat_labels = Categorical(labels, labels)

    frame = frame.sort_values(by=['value'], ascending=True)
    frame['value_group'] = pd.cut(frame.value, range(0, 10500, 500),
                                  right=False, labels=cat_labels)

    counts = frame.groupby(
        ['value_group'], observed=False)['value_group'].count()

    by_lower_bound = sorted(counts.index,
                            key=lambda x: float(x.split()[0]))
    expected = counts[by_lower_bound]
    expected.index = CategoricalIndex(expected.index,
                                      name=expected.index.name)
    tm.assert_series_equal(counts, expected)
def test_sort2():
    """``sort`` is honoured when grouping a frame by a categorical column.

    GH 8868: dataframe groupby sort was being ignored.
    """
    rows = [['(7.5, 10]', 10, 10],
            ['(7.5, 10]', 8, 20],
            ['(2.5, 5]', 5, 30],
            ['(5, 7.5]', 6, 40],
            ['(2.5, 5]', 4, 50],
            ['(0, 2.5]', 1, 60],
            ['(5, 7.5]', 7, 70]]
    frame = DataFrame(rows, columns=['range', 'foo', 'bar'])
    frame['range'] = Categorical(frame['range'], ordered=True)
    key = 'range'

    sorted_index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
                                     '(7.5, 10]'], name='range',
                                    ordered=True)
    expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                              columns=['foo', 'bar'], index=sorted_index)

    result_sort = frame.groupby(key, sort=True, observed=False).first()
    assert_frame_equal(result_sort, expected_sort)

    # when categories is ordered, group is ordered by category's order
    expected_sort = result_sort
    result_sort = frame.groupby(key, sort=False, observed=False).first()
    assert_frame_equal(result_sort, expected_sort)

    frame['range'] = Categorical(frame['range'], ordered=False)

    sorted_index = CategoricalIndex(['(0, 2.5]', '(2.5, 5]', '(5, 7.5]',
                                     '(7.5, 10]'], name='range')
    expected_sort = DataFrame([[1, 60], [5, 30], [6, 40], [10, 10]],
                              columns=['foo', 'bar'], index=sorted_index)

    # index and categories both listed in the frame's row order
    unsorted_index = CategoricalIndex(['(7.5, 10]', '(2.5, 5]', '(5, 7.5]',
                                       '(0, 2.5]'],
                                      categories=['(7.5, 10]', '(2.5, 5]',
                                                  '(5, 7.5]', '(0, 2.5]'],
                                      name='range')
    expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                                index=unsorted_index,
                                columns=['foo', 'bar'])

    # this is an unordered categorical, but we allow this ####
    result_sort = frame.groupby(key, sort=True, observed=False).first()
    assert_frame_equal(result_sort, expected_sort)

    result_nosort = frame.groupby(key, sort=False, observed=False).first()
    assert_frame_equal(result_nosort, expected_nosort)
def test_sort_datetimelike():
    """``sort`` handling when the categories are datetimes (GH10505).

    Uses the same 'foo'/'bar' values as ``test_sort2``, with the
    categories being first-of-month dates in 2011.
    """
    frame = DataFrame({'dt': [datetime(2011, 7, 1), datetime(2011, 7, 1),
                              datetime(2011, 2, 1), datetime(2011, 5, 1),
                              datetime(2011, 2, 1), datetime(2011, 1, 1),
                              datetime(2011, 5, 1)],
                       'foo': [10, 8, 5, 6, 4, 1, 7],
                       'bar': [10, 20, 30, 40, 50, 60, 70]},
                      columns=['dt', 'foo', 'bar'])
    key = 'dt'

    # ordered=True
    frame['dt'] = Categorical(frame['dt'], ordered=True)

    chronological = [datetime(2011, 1, 1), datetime(2011, 2, 1),
                     datetime(2011, 5, 1), datetime(2011, 7, 1)]
    expected_sort = DataFrame(
        [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
    expected_sort.index = CategoricalIndex(chronological, name='dt',
                                           ordered=True)

    # built for symmetry with the unordered case; not asserted against
    row_order = [datetime(2011, 7, 1), datetime(2011, 2, 1),
                 datetime(2011, 5, 1), datetime(2011, 1, 1)]
    expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                                columns=['foo', 'bar'])
    expected_nosort.index = CategoricalIndex(row_order,
                                             categories=row_order,
                                             name='dt', ordered=True)

    assert_frame_equal(
        expected_sort,
        frame.groupby(key, sort=True, observed=False).first())

    # when categories is ordered, group is ordered by category's order
    assert_frame_equal(
        expected_sort,
        frame.groupby(key, sort=False, observed=False).first())

    # ordered = False
    frame['dt'] = Categorical(frame['dt'], ordered=False)

    chronological = [datetime(2011, 1, 1), datetime(2011, 2, 1),
                     datetime(2011, 5, 1), datetime(2011, 7, 1)]
    expected_sort = DataFrame(
        [[1, 60], [5, 30], [6, 40], [10, 10]], columns=['foo', 'bar'])
    expected_sort.index = CategoricalIndex(chronological, name='dt')

    row_order = [datetime(2011, 7, 1), datetime(2011, 2, 1),
                 datetime(2011, 5, 1), datetime(2011, 1, 1)]
    expected_nosort = DataFrame([[10, 10], [5, 30], [6, 40], [1, 60]],
                                columns=['foo', 'bar'])
    expected_nosort.index = CategoricalIndex(row_order,
                                             categories=row_order,
                                             name='dt')

    assert_frame_equal(
        expected_sort,
        frame.groupby(key, sort=True, observed=False).first())
    assert_frame_equal(
        expected_nosort,
        frame.groupby(key, sort=False, observed=False).first())
def test_empty_sum():
    """Sum of a category without rows, for several ``min_count`` values.

    https://github.com/pandas-dev/pandas/issues/18678
    """
    key = pd.Categorical(['a', 'a', 'b'], categories=['a', 'b', 'c'])
    frame = pd.DataFrame({"A": key, 'B': [1, 2, 1]})
    want_index = pd.CategoricalIndex(['a', 'b', 'c'], name='A')

    # 0 by default
    got = frame.groupby("A", observed=False).B.sum()
    want = pd.Series([3, 1, 0], want_index, name='B')
    tm.assert_series_equal(got, want)

    # min_count=0
    got = frame.groupby("A", observed=False).B.sum(min_count=0)
    want = pd.Series([3, 1, 0], want_index, name='B')
    tm.assert_series_equal(got, want)

    # min_count=1
    got = frame.groupby("A", observed=False).B.sum(min_count=1)
    want = pd.Series([3, 1, np.nan], want_index, name='B')
    tm.assert_series_equal(got, want)

    # min_count>1
    got = frame.groupby("A", observed=False).B.sum(min_count=2)
    want = pd.Series([3, np.nan, np.nan], want_index, name='B')
    tm.assert_series_equal(got, want)
def test_empty_prod():
    """Product of a category without rows, for several ``min_count`` values.

    https://github.com/pandas-dev/pandas/issues/18678
    """
    key = pd.Categorical(['a', 'a', 'b'], categories=['a', 'b', 'c'])
    frame = pd.DataFrame({"A": key, 'B': [1, 2, 1]})
    want_index = pd.CategoricalIndex(['a', 'b', 'c'], name='A')

    # 1 by default
    got = frame.groupby("A", observed=False).B.prod()
    want = pd.Series([2, 1, 1], want_index, name='B')
    tm.assert_series_equal(got, want)

    # min_count=0
    got = frame.groupby("A", observed=False).B.prod(min_count=0)
    want = pd.Series([2, 1, 1], want_index, name='B')
    tm.assert_series_equal(got, want)

    # min_count=1
    got = frame.groupby("A", observed=False).B.prod(min_count=1)
    want = pd.Series([2, 1, np.nan], want_index, name='B')
    tm.assert_series_equal(got, want)
def test_groupby_multiindex_categorical_datetime():
    """Mean grouped by a string categorical and a datetime categorical.

    https://github.com/pandas-dev/pandas/issues/21390
    """
    key1 = pd.Categorical(list('abcbabcba'))
    key2 = pd.Categorical(
        list(pd.date_range('2018-06-01 00', freq='1T', periods=3)) * 3)
    frame = pd.DataFrame({
        'key1': key1,
        'key2': key2,
        'values': np.arange(9),
    })
    result = frame.groupby(['key1', 'key2']).mean()

    # The expected index is the full product of both sets of categories;
    # the one NaN marks a pair expected to have no value.
    product_index = pd.MultiIndex.from_product(
        [pd.Categorical(['a', 'b', 'c']),
         pd.Categorical(
             pd.date_range('2018-06-01 00', freq='1T', periods=3))],
        names=['key1', 'key2'])
    expected = pd.DataFrame(
        {'values': [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=product_index)
    assert_frame_equal(result, expected)