1764 lines
78 KiB
Python
1764 lines
78 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
from datetime import datetime, date, timedelta
|
|
|
|
import pytest
|
|
|
|
|
|
import numpy as np
|
|
|
|
from collections import OrderedDict
|
|
import pandas as pd
|
|
from pandas import (DataFrame, Series, Index, MultiIndex,
|
|
Grouper, date_range, concat, Categorical)
|
|
from pandas.core.reshape.pivot import pivot_table, crosstab
|
|
from pandas.compat import range, product
|
|
import pandas.util.testing as tm
|
|
from pandas.api.types import CategoricalDtype as CDT
|
|
|
|
|
|
@pytest.fixture(params=[True, False])
|
|
def dropna(request):
|
|
return request.param
|
|
|
|
|
|
class TestPivotTable(object):
|
|
|
|
def setup_method(self, method):
|
|
self.data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
|
|
'bar', 'bar', 'bar', 'bar',
|
|
'foo', 'foo', 'foo'],
|
|
'B': ['one', 'one', 'one', 'two',
|
|
'one', 'one', 'one', 'two',
|
|
'two', 'two', 'one'],
|
|
'C': ['dull', 'dull', 'shiny', 'dull',
|
|
'dull', 'shiny', 'shiny', 'dull',
|
|
'shiny', 'shiny', 'shiny'],
|
|
'D': np.random.randn(11),
|
|
'E': np.random.randn(11),
|
|
'F': np.random.randn(11)})
|
|
|
|
def test_pivot_table(self):
|
|
index = ['A', 'B']
|
|
columns = 'C'
|
|
table = pivot_table(self.data, values='D',
|
|
index=index, columns=columns)
|
|
|
|
table2 = self.data.pivot_table(
|
|
values='D', index=index, columns=columns)
|
|
tm.assert_frame_equal(table, table2)
|
|
|
|
# this works
|
|
pivot_table(self.data, values='D', index=index)
|
|
|
|
if len(index) > 1:
|
|
assert table.index.names == tuple(index)
|
|
else:
|
|
assert table.index.name == index[0]
|
|
|
|
if len(columns) > 1:
|
|
assert table.columns.names == columns
|
|
else:
|
|
assert table.columns.name == columns[0]
|
|
|
|
expected = self.data.groupby(
|
|
index + [columns])['D'].agg(np.mean).unstack()
|
|
tm.assert_frame_equal(table, expected)
|
|
|
|
def test_pivot_table_nocols(self):
|
|
df = DataFrame({'rows': ['a', 'b', 'c'],
|
|
'cols': ['x', 'y', 'z'],
|
|
'values': [1, 2, 3]})
|
|
rs = df.pivot_table(columns='cols', aggfunc=np.sum)
|
|
xp = df.pivot_table(index='cols', aggfunc=np.sum).T
|
|
tm.assert_frame_equal(rs, xp)
|
|
|
|
rs = df.pivot_table(columns='cols', aggfunc={'values': 'mean'})
|
|
xp = df.pivot_table(index='cols', aggfunc={'values': 'mean'}).T
|
|
tm.assert_frame_equal(rs, xp)
|
|
|
|
def test_pivot_table_dropna(self):
|
|
df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
|
|
'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
|
|
'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
|
|
'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
|
|
'quantity': {0: 2000000, 1: 500000,
|
|
2: 1000000, 3: 1000000}})
|
|
pv_col = df.pivot_table('quantity', 'month', [
|
|
'customer', 'product'], dropna=False)
|
|
pv_ind = df.pivot_table(
|
|
'quantity', ['customer', 'product'], 'month', dropna=False)
|
|
|
|
m = MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('A', 'c'),
|
|
('A', 'd'), ('B', 'a'), ('B', 'b'),
|
|
('B', 'c'), ('B', 'd'), ('C', 'a'),
|
|
('C', 'b'), ('C', 'c'), ('C', 'd')],
|
|
names=['customer', 'product'])
|
|
tm.assert_index_equal(pv_col.columns, m)
|
|
tm.assert_index_equal(pv_ind.index, m)
|
|
|
|
def test_pivot_table_categorical(self):
|
|
|
|
cat1 = Categorical(["a", "a", "b", "b"],
|
|
categories=["a", "b", "z"], ordered=True)
|
|
cat2 = Categorical(["c", "d", "c", "d"],
|
|
categories=["c", "d", "y"], ordered=True)
|
|
df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
|
|
result = pd.pivot_table(df, values='values', index=['A', 'B'],
|
|
dropna=True)
|
|
|
|
exp_index = pd.MultiIndex.from_arrays(
|
|
[cat1, cat2],
|
|
names=['A', 'B'])
|
|
expected = DataFrame(
|
|
{'values': [1, 2, 3, 4]},
|
|
index=exp_index)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_pivot_table_dropna_categoricals(self, dropna):
|
|
# GH 15193
|
|
categories = ['a', 'b', 'c', 'd']
|
|
|
|
df = DataFrame({'A': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'],
|
|
'B': [1, 2, 3, 1, 2, 3, 1, 2, 3],
|
|
'C': range(0, 9)})
|
|
|
|
df['A'] = df['A'].astype(CDT(categories, ordered=False))
|
|
result = df.pivot_table(index='B', columns='A', values='C',
|
|
dropna=dropna)
|
|
expected_columns = Series(['a', 'b', 'c'], name='A')
|
|
expected_columns = expected_columns.astype(
|
|
CDT(categories, ordered=False))
|
|
expected_index = Series([1, 2, 3], name='B')
|
|
expected = DataFrame([[0, 3, 6],
|
|
[1, 4, 7],
|
|
[2, 5, 8]],
|
|
index=expected_index,
|
|
columns=expected_columns,)
|
|
if not dropna:
|
|
# add back the non observed to compare
|
|
expected = expected.reindex(
|
|
columns=Categorical(categories)).astype('float')
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_pivot_with_non_observable_dropna(self, dropna):
|
|
# gh-21133
|
|
df = pd.DataFrame(
|
|
{'A': pd.Categorical([np.nan, 'low', 'high', 'low', 'high'],
|
|
categories=['low', 'high'],
|
|
ordered=True),
|
|
'B': range(5)})
|
|
|
|
result = df.pivot_table(index='A', values='B', dropna=dropna)
|
|
expected = pd.DataFrame(
|
|
{'B': [2, 3]},
|
|
index=pd.Index(
|
|
pd.Categorical.from_codes([0, 1],
|
|
categories=['low', 'high'],
|
|
ordered=True),
|
|
name='A'))
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_pass_array(self):
|
|
result = self.data.pivot_table(
|
|
'D', index=self.data.A, columns=self.data.C)
|
|
expected = self.data.pivot_table('D', index='A', columns='C')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_pass_function(self):
|
|
result = self.data.pivot_table('D', index=lambda x: x // 5,
|
|
columns=self.data.C)
|
|
expected = self.data.pivot_table('D', index=self.data.index // 5,
|
|
columns='C')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_pivot_table_multiple(self):
|
|
index = ['A', 'B']
|
|
columns = 'C'
|
|
table = pivot_table(self.data, index=index, columns=columns)
|
|
expected = self.data.groupby(index + [columns]).agg(np.mean).unstack()
|
|
tm.assert_frame_equal(table, expected)
|
|
|
|
def test_pivot_dtypes(self):
|
|
|
|
# can convert dtypes
|
|
f = DataFrame({'a': ['cat', 'bat', 'cat', 'bat'], 'v': [
|
|
1, 2, 3, 4], 'i': ['a', 'b', 'a', 'b']})
|
|
assert f.dtypes['v'] == 'int64'
|
|
|
|
z = pivot_table(f, values='v', index=['a'], columns=[
|
|
'i'], fill_value=0, aggfunc=np.sum)
|
|
result = z.get_dtype_counts()
|
|
expected = Series(dict(int64=2))
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# cannot convert dtypes
|
|
f = DataFrame({'a': ['cat', 'bat', 'cat', 'bat'], 'v': [
|
|
1.5, 2.5, 3.5, 4.5], 'i': ['a', 'b', 'a', 'b']})
|
|
assert f.dtypes['v'] == 'float64'
|
|
|
|
z = pivot_table(f, values='v', index=['a'], columns=[
|
|
'i'], fill_value=0, aggfunc=np.mean)
|
|
result = z.get_dtype_counts()
|
|
expected = Series(dict(float64=2))
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize('columns,values',
|
|
[('bool1', ['float1', 'float2']),
|
|
('bool1', ['float1', 'float2', 'bool1']),
|
|
('bool2', ['float1', 'float2', 'bool1'])])
|
|
def test_pivot_preserve_dtypes(self, columns, values):
|
|
# GH 7142 regression test
|
|
v = np.arange(5, dtype=np.float64)
|
|
df = DataFrame({'float1': v, 'float2': v + 2.0,
|
|
'bool1': v <= 2, 'bool2': v <= 3})
|
|
|
|
df_res = df.reset_index().pivot_table(
|
|
index='index', columns=columns, values=values)
|
|
|
|
result = dict(df_res.dtypes)
|
|
expected = {col: np.dtype('O') if col[0].startswith('b')
|
|
else np.dtype('float64') for col in df_res}
|
|
assert result == expected
|
|
|
|
def test_pivot_no_values(self):
|
|
# GH 14380
|
|
idx = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-01-02',
|
|
'2011-01-01', '2011-01-02'])
|
|
df = pd.DataFrame({'A': [1, 2, 3, 4, 5]},
|
|
index=idx)
|
|
res = df.pivot_table(index=df.index.month, columns=df.index.day)
|
|
|
|
exp_columns = pd.MultiIndex.from_tuples([('A', 1), ('A', 2)])
|
|
exp = pd.DataFrame([[2.5, 4.0], [2.0, np.nan]],
|
|
index=[1, 2], columns=exp_columns)
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
df = pd.DataFrame({'A': [1, 2, 3, 4, 5],
|
|
'dt': pd.date_range('2011-01-01', freq='D',
|
|
periods=5)},
|
|
index=idx)
|
|
res = df.pivot_table(index=df.index.month,
|
|
columns=pd.Grouper(key='dt', freq='M'))
|
|
exp_columns = pd.MultiIndex.from_tuples([('A',
|
|
pd.Timestamp('2011-01-31'))])
|
|
exp_columns.names = [None, 'dt']
|
|
exp = pd.DataFrame([3.25, 2.0],
|
|
index=[1, 2], columns=exp_columns)
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
res = df.pivot_table(index=pd.Grouper(freq='A'),
|
|
columns=pd.Grouper(key='dt', freq='M'))
|
|
exp = pd.DataFrame([3],
|
|
index=pd.DatetimeIndex(['2011-12-31']),
|
|
columns=exp_columns)
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_pivot_multi_values(self):
|
|
result = pivot_table(self.data, values=['D', 'E'],
|
|
index='A', columns=['B', 'C'], fill_value=0)
|
|
expected = pivot_table(self.data.drop(['F'], axis=1),
|
|
index='A', columns=['B', 'C'], fill_value=0)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_pivot_multi_functions(self):
|
|
f = lambda func: pivot_table(self.data, values=['D', 'E'],
|
|
index=['A', 'B'], columns='C',
|
|
aggfunc=func)
|
|
result = f([np.mean, np.std])
|
|
means = f(np.mean)
|
|
stds = f(np.std)
|
|
expected = concat([means, stds], keys=['mean', 'std'], axis=1)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# margins not supported??
|
|
f = lambda func: pivot_table(self.data, values=['D', 'E'],
|
|
index=['A', 'B'], columns='C',
|
|
aggfunc=func, margins=True)
|
|
result = f([np.mean, np.std])
|
|
means = f(np.mean)
|
|
stds = f(np.std)
|
|
expected = concat([means, stds], keys=['mean', 'std'], axis=1)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_pivot_index_with_nan(self):
|
|
# GH 3588
|
|
nan = np.nan
|
|
df = DataFrame({'a': ['R1', 'R2', nan, 'R4'],
|
|
'b': ['C1', 'C2', 'C3', 'C4'],
|
|
'c': [10, 15, 17, 20]})
|
|
result = df.pivot('a', 'b', 'c')
|
|
expected = DataFrame([[nan, nan, 17, nan], [10, nan, nan, nan],
|
|
[nan, 15, nan, nan], [nan, nan, nan, 20]],
|
|
index=Index([nan, 'R1', 'R2', 'R4'], name='a'),
|
|
columns=Index(['C1', 'C2', 'C3', 'C4'], name='b'))
|
|
tm.assert_frame_equal(result, expected)
|
|
tm.assert_frame_equal(df.pivot('b', 'a', 'c'), expected.T)
|
|
|
|
# GH9491
|
|
df = DataFrame({'a': pd.date_range('2014-02-01', periods=6, freq='D'),
|
|
'c': 100 + np.arange(6)})
|
|
df['b'] = df['a'] - pd.Timestamp('2014-02-02')
|
|
df.loc[1, 'a'] = df.loc[3, 'a'] = nan
|
|
df.loc[1, 'b'] = df.loc[4, 'b'] = nan
|
|
|
|
pv = df.pivot('a', 'b', 'c')
|
|
assert pv.notna().values.sum() == len(df)
|
|
|
|
for _, row in df.iterrows():
|
|
assert pv.loc[row['a'], row['b']] == row['c']
|
|
|
|
tm.assert_frame_equal(df.pivot('b', 'a', 'c'), pv.T)
|
|
|
|
def test_pivot_with_tz(self):
|
|
# GH 5878
|
|
df = DataFrame({'dt1': [datetime(2013, 1, 1, 9, 0),
|
|
datetime(2013, 1, 2, 9, 0),
|
|
datetime(2013, 1, 1, 9, 0),
|
|
datetime(2013, 1, 2, 9, 0)],
|
|
'dt2': [datetime(2014, 1, 1, 9, 0),
|
|
datetime(2014, 1, 1, 9, 0),
|
|
datetime(2014, 1, 2, 9, 0),
|
|
datetime(2014, 1, 2, 9, 0)],
|
|
'data1': np.arange(4, dtype='int64'),
|
|
'data2': np.arange(4, dtype='int64')})
|
|
|
|
df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
|
|
df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))
|
|
|
|
exp_col1 = Index(['data1', 'data1', 'data2', 'data2'])
|
|
exp_col2 = pd.DatetimeIndex(['2014/01/01 09:00',
|
|
'2014/01/02 09:00'] * 2,
|
|
name='dt2', tz='Asia/Tokyo')
|
|
exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2])
|
|
expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]],
|
|
index=pd.DatetimeIndex(['2013/01/01 09:00',
|
|
'2013/01/02 09:00'],
|
|
name='dt1',
|
|
tz='US/Pacific'),
|
|
columns=exp_col)
|
|
|
|
pv = df.pivot(index='dt1', columns='dt2')
|
|
tm.assert_frame_equal(pv, expected)
|
|
|
|
expected = DataFrame([[0, 2], [1, 3]],
|
|
index=pd.DatetimeIndex(['2013/01/01 09:00',
|
|
'2013/01/02 09:00'],
|
|
name='dt1',
|
|
tz='US/Pacific'),
|
|
columns=pd.DatetimeIndex(['2014/01/01 09:00',
|
|
'2014/01/02 09:00'],
|
|
name='dt2',
|
|
tz='Asia/Tokyo'))
|
|
|
|
pv = df.pivot(index='dt1', columns='dt2', values='data1')
|
|
tm.assert_frame_equal(pv, expected)
|
|
|
|
def test_pivot_periods(self):
|
|
df = DataFrame({'p1': [pd.Period('2013-01-01', 'D'),
|
|
pd.Period('2013-01-02', 'D'),
|
|
pd.Period('2013-01-01', 'D'),
|
|
pd.Period('2013-01-02', 'D')],
|
|
'p2': [pd.Period('2013-01', 'M'),
|
|
pd.Period('2013-01', 'M'),
|
|
pd.Period('2013-02', 'M'),
|
|
pd.Period('2013-02', 'M')],
|
|
'data1': np.arange(4, dtype='int64'),
|
|
'data2': np.arange(4, dtype='int64')})
|
|
|
|
exp_col1 = Index(['data1', 'data1', 'data2', 'data2'])
|
|
exp_col2 = pd.PeriodIndex(['2013-01', '2013-02'] * 2,
|
|
name='p2', freq='M')
|
|
exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2])
|
|
expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]],
|
|
index=pd.PeriodIndex(['2013-01-01', '2013-01-02'],
|
|
name='p1', freq='D'),
|
|
columns=exp_col)
|
|
|
|
pv = df.pivot(index='p1', columns='p2')
|
|
tm.assert_frame_equal(pv, expected)
|
|
|
|
expected = DataFrame([[0, 2], [1, 3]],
|
|
index=pd.PeriodIndex(['2013-01-01', '2013-01-02'],
|
|
name='p1', freq='D'),
|
|
columns=pd.PeriodIndex(['2013-01', '2013-02'],
|
|
name='p2', freq='M'))
|
|
|
|
pv = df.pivot(index='p1', columns='p2', values='data1')
|
|
tm.assert_frame_equal(pv, expected)
|
|
|
|
@pytest.mark.parametrize('values', [
|
|
['baz', 'zoo'], np.array(['baz', 'zoo']),
|
|
pd.Series(['baz', 'zoo']), pd.Index(['baz', 'zoo'])
|
|
])
|
|
def test_pivot_with_list_like_values(self, values):
|
|
# issue #17160
|
|
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
|
|
'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
|
|
'baz': [1, 2, 3, 4, 5, 6],
|
|
'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
|
|
|
|
result = df.pivot(index='foo', columns='bar', values=values)
|
|
|
|
data = [[1, 2, 3, 'x', 'y', 'z'],
|
|
[4, 5, 6, 'q', 'w', 't']]
|
|
index = Index(data=['one', 'two'], name='foo')
|
|
columns = MultiIndex(levels=[['baz', 'zoo'], ['A', 'B', 'C']],
|
|
labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]],
|
|
names=[None, 'bar'])
|
|
expected = DataFrame(data=data, index=index,
|
|
columns=columns, dtype='object')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize('values', [
|
|
['bar', 'baz'], np.array(['bar', 'baz']),
|
|
pd.Series(['bar', 'baz']), pd.Index(['bar', 'baz'])
|
|
])
|
|
def test_pivot_with_list_like_values_nans(self, values):
|
|
# issue #17160
|
|
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
|
|
'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
|
|
'baz': [1, 2, 3, 4, 5, 6],
|
|
'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
|
|
|
|
result = df.pivot(index='zoo', columns='foo', values=values)
|
|
|
|
data = [[np.nan, 'A', np.nan, 4],
|
|
[np.nan, 'C', np.nan, 6],
|
|
[np.nan, 'B', np.nan, 5],
|
|
['A', np.nan, 1, np.nan],
|
|
['B', np.nan, 2, np.nan],
|
|
['C', np.nan, 3, np.nan]]
|
|
index = Index(data=['q', 't', 'w', 'x', 'y', 'z'], name='zoo')
|
|
columns = MultiIndex(levels=[['bar', 'baz'], ['one', 'two']],
|
|
labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
|
|
names=[None, 'foo'])
|
|
expected = DataFrame(data=data, index=index,
|
|
columns=columns, dtype='object')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.xfail(reason='MultiIndexed unstack with tuple names fails'
|
|
'with KeyError #19966')
|
|
def test_pivot_with_multiindex(self):
|
|
# issue #17160
|
|
index = Index(data=[0, 1, 2, 3, 4, 5])
|
|
data = [['one', 'A', 1, 'x'],
|
|
['one', 'B', 2, 'y'],
|
|
['one', 'C', 3, 'z'],
|
|
['two', 'A', 4, 'q'],
|
|
['two', 'B', 5, 'w'],
|
|
['two', 'C', 6, 't']]
|
|
columns = MultiIndex(levels=[['bar', 'baz'], ['first', 'second']],
|
|
labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
|
|
df = DataFrame(data=data, index=index, columns=columns, dtype='object')
|
|
result = df.pivot(index=('bar', 'first'), columns=('bar', 'second'),
|
|
values=('baz', 'first'))
|
|
|
|
data = {'A': Series([1, 4], index=['one', 'two']),
|
|
'B': Series([2, 5], index=['one', 'two']),
|
|
'C': Series([3, 6], index=['one', 'two'])}
|
|
expected = DataFrame(data)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_pivot_with_tuple_of_values(self):
|
|
# issue #17160
|
|
df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
|
|
'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
|
|
'baz': [1, 2, 3, 4, 5, 6],
|
|
'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
|
|
with pytest.raises(KeyError):
|
|
# tuple is seen as a single column name
|
|
df.pivot(index='zoo', columns='foo', values=('bar', 'baz'))
|
|
|
|
def test_margins(self):
|
|
def _check_output(result, values_col, index=['A', 'B'],
|
|
columns=['C'],
|
|
margins_col='All'):
|
|
col_margins = result.loc[result.index[:-1], margins_col]
|
|
expected_col_margins = self.data.groupby(index)[values_col].mean()
|
|
tm.assert_series_equal(col_margins, expected_col_margins,
|
|
check_names=False)
|
|
assert col_margins.name == margins_col
|
|
|
|
result = result.sort_index()
|
|
index_margins = result.loc[(margins_col, '')].iloc[:-1]
|
|
|
|
expected_ix_margins = self.data.groupby(columns)[values_col].mean()
|
|
tm.assert_series_equal(index_margins, expected_ix_margins,
|
|
check_names=False)
|
|
assert index_margins.name == (margins_col, '')
|
|
|
|
grand_total_margins = result.loc[(margins_col, ''), margins_col]
|
|
expected_total_margins = self.data[values_col].mean()
|
|
assert grand_total_margins == expected_total_margins
|
|
|
|
# column specified
|
|
result = self.data.pivot_table(values='D', index=['A', 'B'],
|
|
columns='C',
|
|
margins=True, aggfunc=np.mean)
|
|
_check_output(result, 'D')
|
|
|
|
# Set a different margins_name (not 'All')
|
|
result = self.data.pivot_table(values='D', index=['A', 'B'],
|
|
columns='C',
|
|
margins=True, aggfunc=np.mean,
|
|
margins_name='Totals')
|
|
_check_output(result, 'D', margins_col='Totals')
|
|
|
|
# no column specified
|
|
table = self.data.pivot_table(index=['A', 'B'], columns='C',
|
|
margins=True, aggfunc=np.mean)
|
|
for value_col in table.columns.levels[0]:
|
|
_check_output(table[value_col], value_col)
|
|
|
|
# no col
|
|
|
|
# to help with a buglet
|
|
self.data.columns = [k * 2 for k in self.data.columns]
|
|
table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
|
|
aggfunc=np.mean)
|
|
for value_col in table.columns:
|
|
totals = table.loc[('All', ''), value_col]
|
|
assert totals == self.data[value_col].mean()
|
|
|
|
# no rows
|
|
rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True,
|
|
aggfunc=np.mean)
|
|
assert isinstance(rtable, Series)
|
|
|
|
table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
|
|
aggfunc='mean')
|
|
for item in ['DD', 'EE', 'FF']:
|
|
totals = table.loc[('All', ''), item]
|
|
assert totals == self.data[item].mean()
|
|
|
|
# issue number #8349: pivot_table with margins and dictionary aggfunc
|
|
data = [
|
|
{'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2013,
|
|
'MONTH': 12, 'DAYS': 3, 'SALARY': 17},
|
|
{'JOB': 'Employ', 'NAME':
|
|
'Mary', 'YEAR': 2013, 'MONTH': 12, 'DAYS': 5, 'SALARY': 23},
|
|
{'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014,
|
|
'MONTH': 1, 'DAYS': 10, 'SALARY': 100},
|
|
{'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014,
|
|
'MONTH': 1, 'DAYS': 11, 'SALARY': 110},
|
|
{'JOB': 'Employ', 'NAME': 'Mary', 'YEAR': 2014,
|
|
'MONTH': 1, 'DAYS': 15, 'SALARY': 200},
|
|
{'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014,
|
|
'MONTH': 2, 'DAYS': 8, 'SALARY': 80},
|
|
{'JOB': 'Employ', 'NAME': 'Mary', 'YEAR': 2014,
|
|
'MONTH': 2, 'DAYS': 5, 'SALARY': 190},
|
|
]
|
|
|
|
df = DataFrame(data)
|
|
|
|
df = df.set_index(['JOB', 'NAME', 'YEAR', 'MONTH'], drop=False,
|
|
append=False)
|
|
|
|
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
|
|
result = df.pivot_table(index=['JOB', 'NAME'],
|
|
columns=['YEAR', 'MONTH'],
|
|
values=['DAYS', 'SALARY'],
|
|
aggfunc={'DAYS': 'mean', 'SALARY': 'sum'},
|
|
margins=True)
|
|
|
|
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
|
|
expected = df.pivot_table(index=['JOB', 'NAME'],
|
|
columns=['YEAR', 'MONTH'],
|
|
values=['DAYS'],
|
|
aggfunc='mean', margins=True)
|
|
|
|
tm.assert_frame_equal(result['DAYS'], expected['DAYS'])
|
|
|
|
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
|
|
expected = df.pivot_table(index=['JOB', 'NAME'],
|
|
columns=['YEAR', 'MONTH'],
|
|
values=['SALARY'],
|
|
aggfunc='sum', margins=True)
|
|
|
|
tm.assert_frame_equal(result['SALARY'], expected['SALARY'])
|
|
|
|
def test_margins_dtype(self):
|
|
# GH 17013
|
|
|
|
df = self.data.copy()
|
|
df[['D', 'E', 'F']] = np.arange(len(df) * 3).reshape(len(df), 3)
|
|
|
|
mi_val = list(product(['bar', 'foo'], ['one', 'two'])) + [('All', '')]
|
|
mi = MultiIndex.from_tuples(mi_val, names=('A', 'B'))
|
|
expected = DataFrame({'dull': [12, 21, 3, 9, 45],
|
|
'shiny': [33, 0, 36, 51, 120]},
|
|
index=mi).rename_axis('C', axis=1)
|
|
expected['All'] = expected['dull'] + expected['shiny']
|
|
|
|
result = df.pivot_table(values='D', index=['A', 'B'],
|
|
columns='C', margins=True,
|
|
aggfunc=np.sum, fill_value=0)
|
|
|
|
tm.assert_frame_equal(expected, result)
|
|
|
|
@pytest.mark.xfail(reason='GH 17035 (len of floats is casted back to '
|
|
'floats)')
|
|
def test_margins_dtype_len(self):
|
|
mi_val = list(product(['bar', 'foo'], ['one', 'two'])) + [('All', '')]
|
|
mi = MultiIndex.from_tuples(mi_val, names=('A', 'B'))
|
|
expected = DataFrame({'dull': [1, 1, 2, 1, 5],
|
|
'shiny': [2, 0, 2, 2, 6]},
|
|
index=mi).rename_axis('C', axis=1)
|
|
expected['All'] = expected['dull'] + expected['shiny']
|
|
|
|
result = self.data.pivot_table(values='D', index=['A', 'B'],
|
|
columns='C', margins=True,
|
|
aggfunc=len, fill_value=0)
|
|
|
|
tm.assert_frame_equal(expected, result)
|
|
|
|
def test_pivot_integer_columns(self):
|
|
# caused by upstream bug in unstack
|
|
|
|
d = date.min
|
|
data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
|
|
[d + timedelta(i)
|
|
for i in range(20)], [1.0]))
|
|
df = DataFrame(data)
|
|
table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])
|
|
|
|
df2 = df.rename(columns=str)
|
|
table2 = df2.pivot_table(
|
|
values='4', index=['0', '1', '3'], columns=['2'])
|
|
|
|
tm.assert_frame_equal(table, table2, check_names=False)
|
|
|
|
def test_pivot_no_level_overlap(self):
|
|
# GH #1181
|
|
|
|
data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2,
|
|
'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2,
|
|
'c': (['foo'] * 4 + ['bar'] * 4) * 2,
|
|
'value': np.random.randn(16)})
|
|
|
|
table = data.pivot_table('value', index='a', columns=['b', 'c'])
|
|
|
|
grouped = data.groupby(['a', 'b', 'c'])['value'].mean()
|
|
expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all')
|
|
tm.assert_frame_equal(table, expected)
|
|
|
|
def test_pivot_columns_lexsorted(self):
|
|
|
|
n = 10000
|
|
|
|
dtype = np.dtype([
|
|
("Index", object),
|
|
("Symbol", object),
|
|
("Year", int),
|
|
("Month", int),
|
|
("Day", int),
|
|
("Quantity", int),
|
|
("Price", float),
|
|
])
|
|
|
|
products = np.array([
|
|
('SP500', 'ADBE'),
|
|
('SP500', 'NVDA'),
|
|
('SP500', 'ORCL'),
|
|
('NDQ100', 'AAPL'),
|
|
('NDQ100', 'MSFT'),
|
|
('NDQ100', 'GOOG'),
|
|
('FTSE', 'DGE.L'),
|
|
('FTSE', 'TSCO.L'),
|
|
('FTSE', 'GSK.L'),
|
|
], dtype=[('Index', object), ('Symbol', object)])
|
|
items = np.empty(n, dtype=dtype)
|
|
iproduct = np.random.randint(0, len(products), n)
|
|
items['Index'] = products['Index'][iproduct]
|
|
items['Symbol'] = products['Symbol'][iproduct]
|
|
dr = pd.date_range(date(2000, 1, 1),
|
|
date(2010, 12, 31))
|
|
dates = dr[np.random.randint(0, len(dr), n)]
|
|
items['Year'] = dates.year
|
|
items['Month'] = dates.month
|
|
items['Day'] = dates.day
|
|
items['Price'] = np.random.lognormal(4.0, 2.0, n)
|
|
|
|
df = DataFrame(items)
|
|
|
|
pivoted = df.pivot_table('Price', index=['Month', 'Day'],
|
|
columns=['Index', 'Symbol', 'Year'],
|
|
aggfunc='mean')
|
|
|
|
assert pivoted.columns.is_monotonic
|
|
|
|
def test_pivot_complex_aggfunc(self):
|
|
f = OrderedDict([('D', ['std']), ('E', ['sum'])])
|
|
expected = self.data.groupby(['A', 'B']).agg(f).unstack('B')
|
|
result = self.data.pivot_table(index='A', columns='B', aggfunc=f)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_margins_no_values_no_cols(self):
|
|
# Regression test on pivot table: no values or cols passed.
|
|
result = self.data[['A', 'B']].pivot_table(
|
|
index=['A', 'B'], aggfunc=len, margins=True)
|
|
result_list = result.tolist()
|
|
assert sum(result_list[:-1]) == result_list[-1]
|
|
|
|
def test_margins_no_values_two_rows(self):
|
|
# Regression test on pivot table: no values passed but rows are a
|
|
# multi-index
|
|
result = self.data[['A', 'B', 'C']].pivot_table(
|
|
index=['A', 'B'], columns='C', aggfunc=len, margins=True)
|
|
assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0]
|
|
|
|
def test_margins_no_values_one_row_one_col(self):
|
|
# Regression test on pivot table: no values passed but row and col
|
|
# defined
|
|
result = self.data[['A', 'B']].pivot_table(
|
|
index='A', columns='B', aggfunc=len, margins=True)
|
|
assert result.All.tolist() == [4.0, 7.0, 11.0]
|
|
|
|
def test_margins_no_values_two_row_two_cols(self):
|
|
# Regression test on pivot table: no values passed but rows and cols
|
|
# are multi-indexed
|
|
self.data['D'] = ['a', 'b', 'c', 'd',
|
|
'e', 'f', 'g', 'h', 'i', 'j', 'k']
|
|
result = self.data[['A', 'B', 'C', 'D']].pivot_table(
|
|
index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True)
|
|
assert result.All.tolist() == [3.0, 1.0, 4.0, 3.0, 11.0]
|
|
|
|
def test_pivot_table_with_margins_set_margin_name(self):
|
|
# see gh-3335
|
|
for margin_name in ['foo', 'one', 666, None, ['a', 'b']]:
|
|
with pytest.raises(ValueError):
|
|
# multi-index index
|
|
pivot_table(self.data, values='D', index=['A', 'B'],
|
|
columns=['C'], margins=True,
|
|
margins_name=margin_name)
|
|
with pytest.raises(ValueError):
|
|
# multi-index column
|
|
pivot_table(self.data, values='D', index=['C'],
|
|
columns=['A', 'B'], margins=True,
|
|
margins_name=margin_name)
|
|
with pytest.raises(ValueError):
|
|
# non-multi-index index/column
|
|
pivot_table(self.data, values='D', index=['A'],
|
|
columns=['B'], margins=True,
|
|
margins_name=margin_name)
|
|
|
|
def test_pivot_timegrouper(self):
|
|
df = DataFrame({
|
|
'Branch': 'A A A A A A A B'.split(),
|
|
'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
|
|
'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
|
|
'Date': [datetime(2013, 1, 1),
|
|
datetime(2013, 1, 1),
|
|
datetime(2013, 10, 1),
|
|
datetime(2013, 10, 2),
|
|
datetime(2013, 10, 1),
|
|
datetime(2013, 10, 2),
|
|
datetime(2013, 12, 2),
|
|
datetime(2013, 12, 2), ]}).set_index('Date')
|
|
|
|
expected = DataFrame(np.array([10, 18, 3], dtype='int64')
|
|
.reshape(1, 3),
|
|
index=[datetime(2013, 12, 31)],
|
|
columns='Carl Joe Mark'.split())
|
|
expected.index.name = 'Date'
|
|
expected.columns.name = 'Buyer'
|
|
|
|
result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer',
|
|
values='Quantity', aggfunc=np.sum)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'),
|
|
values='Quantity', aggfunc=np.sum)
|
|
tm.assert_frame_equal(result, expected.T)
|
|
|
|
expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan])
|
|
.reshape(2, 3),
|
|
index=[datetime(2013, 1, 1),
|
|
datetime(2013, 7, 1)],
|
|
columns='Carl Joe Mark'.split())
|
|
expected.index.name = 'Date'
|
|
expected.columns.name = 'Buyer'
|
|
|
|
result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer',
|
|
values='Quantity', aggfunc=np.sum)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'),
|
|
values='Quantity', aggfunc=np.sum)
|
|
tm.assert_frame_equal(result, expected.T)
|
|
|
|
# passing the name
|
|
df = df.reset_index()
|
|
result = pivot_table(df, index=Grouper(freq='6MS', key='Date'),
|
|
columns='Buyer',
|
|
values='Quantity', aggfunc=np.sum)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = pivot_table(df, index='Buyer',
|
|
columns=Grouper(freq='6MS', key='Date'),
|
|
values='Quantity', aggfunc=np.sum)
|
|
tm.assert_frame_equal(result, expected.T)
|
|
|
|
pytest.raises(KeyError, lambda: pivot_table(
|
|
df, index=Grouper(freq='6MS', key='foo'),
|
|
columns='Buyer', values='Quantity', aggfunc=np.sum))
|
|
pytest.raises(KeyError, lambda: pivot_table(
|
|
df, index='Buyer',
|
|
columns=Grouper(freq='6MS', key='foo'),
|
|
values='Quantity', aggfunc=np.sum))
|
|
|
|
# passing the level
|
|
df = df.set_index('Date')
|
|
result = pivot_table(df, index=Grouper(freq='6MS', level='Date'),
|
|
columns='Buyer', values='Quantity',
|
|
aggfunc=np.sum)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = pivot_table(df, index='Buyer',
|
|
columns=Grouper(freq='6MS', level='Date'),
|
|
values='Quantity', aggfunc=np.sum)
|
|
tm.assert_frame_equal(result, expected.T)
|
|
|
|
pytest.raises(ValueError, lambda: pivot_table(
|
|
df, index=Grouper(freq='6MS', level='foo'),
|
|
columns='Buyer', values='Quantity', aggfunc=np.sum))
|
|
pytest.raises(ValueError, lambda: pivot_table(
|
|
df, index='Buyer',
|
|
columns=Grouper(freq='6MS', level='foo'),
|
|
values='Quantity', aggfunc=np.sum))
|
|
|
|
# double grouper
|
|
df = DataFrame({
|
|
'Branch': 'A A A A A A A B'.split(),
|
|
'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
|
|
'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
|
|
'Date': [datetime(2013, 11, 1, 13, 0), datetime(2013, 9, 1, 13, 5),
|
|
datetime(2013, 10, 1, 20, 0),
|
|
datetime(2013, 10, 2, 10, 0),
|
|
datetime(2013, 11, 1, 20, 0),
|
|
datetime(2013, 10, 2, 10, 0),
|
|
datetime(2013, 10, 2, 12, 0),
|
|
datetime(2013, 12, 5, 14, 0)],
|
|
'PayDay': [datetime(2013, 10, 4, 0, 0),
|
|
datetime(2013, 10, 15, 13, 5),
|
|
datetime(2013, 9, 5, 20, 0),
|
|
datetime(2013, 11, 2, 10, 0),
|
|
datetime(2013, 10, 7, 20, 0),
|
|
datetime(2013, 9, 5, 10, 0),
|
|
datetime(2013, 12, 30, 12, 0),
|
|
datetime(2013, 11, 20, 14, 0), ]})
|
|
|
|
result = pivot_table(df, index=Grouper(freq='M', key='Date'),
|
|
columns=Grouper(freq='M', key='PayDay'),
|
|
values='Quantity', aggfunc=np.sum)
|
|
expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan,
|
|
6, np.nan, 1, 9,
|
|
np.nan, 9, np.nan, np.nan, np.nan,
|
|
np.nan, 3, np.nan]).reshape(4, 4),
|
|
index=[datetime(2013, 9, 30),
|
|
datetime(2013, 10, 31),
|
|
datetime(2013, 11, 30),
|
|
datetime(2013, 12, 31)],
|
|
columns=[datetime(2013, 9, 30),
|
|
datetime(2013, 10, 31),
|
|
datetime(2013, 11, 30),
|
|
datetime(2013, 12, 31)])
|
|
expected.index.name = 'Date'
|
|
expected.columns.name = 'PayDay'
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = pivot_table(df, index=Grouper(freq='M', key='PayDay'),
|
|
columns=Grouper(freq='M', key='Date'),
|
|
values='Quantity', aggfunc=np.sum)
|
|
tm.assert_frame_equal(result, expected.T)
|
|
|
|
tuples = [(datetime(2013, 9, 30), datetime(2013, 10, 31)),
|
|
(datetime(2013, 10, 31),
|
|
datetime(2013, 9, 30)),
|
|
(datetime(2013, 10, 31),
|
|
datetime(2013, 11, 30)),
|
|
(datetime(2013, 10, 31),
|
|
datetime(2013, 12, 31)),
|
|
(datetime(2013, 11, 30),
|
|
datetime(2013, 10, 31)),
|
|
(datetime(2013, 12, 31), datetime(2013, 11, 30)), ]
|
|
idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay'])
|
|
expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan,
|
|
9, np.nan, 9, np.nan,
|
|
np.nan, 3]).reshape(6, 2),
|
|
index=idx, columns=['A', 'B'])
|
|
expected.columns.name = 'Branch'
|
|
|
|
result = pivot_table(
|
|
df, index=[Grouper(freq='M', key='Date'),
|
|
Grouper(freq='M', key='PayDay')], columns=['Branch'],
|
|
values='Quantity', aggfunc=np.sum)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = pivot_table(df, index=['Branch'],
|
|
columns=[Grouper(freq='M', key='Date'),
|
|
Grouper(freq='M', key='PayDay')],
|
|
values='Quantity', aggfunc=np.sum)
|
|
tm.assert_frame_equal(result, expected.T)
|
|
|
|
def test_pivot_datetime_tz(self):
|
|
dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
|
|
'2011-07-19 09:00:00',
|
|
'2011-07-19 07:00:00', '2011-07-19 08:00:00',
|
|
'2011-07-19 09:00:00']
|
|
dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00',
|
|
'2013-01-01 15:00:00',
|
|
'2013-02-01 15:00:00', '2013-02-01 15:00:00',
|
|
'2013-02-01 15:00:00']
|
|
df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
|
|
'dt1': dates1, 'dt2': dates2,
|
|
'value1': np.arange(6, dtype='int64'),
|
|
'value2': [1, 2] * 3})
|
|
df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
|
|
df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))
|
|
|
|
exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00',
|
|
'2011-07-19 08:00:00',
|
|
'2011-07-19 09:00:00'],
|
|
tz='US/Pacific', name='dt1')
|
|
exp_col1 = Index(['value1', 'value1'])
|
|
exp_col2 = Index(['a', 'b'], name='label')
|
|
exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
|
|
expected = DataFrame([[0, 3], [1, 4], [2, 5]],
|
|
index=exp_idx, columns=exp_col)
|
|
result = pivot_table(df, index=['dt1'], columns=[
|
|
'label'], values=['value1'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
exp_col1 = Index(['sum', 'sum', 'sum', 'sum',
|
|
'mean', 'mean', 'mean', 'mean'])
|
|
exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2)
|
|
exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00',
|
|
'2013-02-01 15:00:00'] * 4,
|
|
tz='Asia/Tokyo', name='dt2')
|
|
exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
|
|
expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2],
|
|
[1, 4, 2, 1, 1, 4, 2, 1],
|
|
[2, 5, 1, 2, 2, 5, 1, 2]],
|
|
dtype='int64'),
|
|
index=exp_idx,
|
|
columns=exp_col)
|
|
|
|
result = pivot_table(df, index=['dt1'], columns=['dt2'],
|
|
values=['value1', 'value2'],
|
|
aggfunc=[np.sum, np.mean])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_pivot_dtaccessor(self):
|
|
# GH 8103
|
|
dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00',
|
|
'2011-07-19 09:00:00',
|
|
'2011-07-19 07:00:00', '2011-07-19 08:00:00',
|
|
'2011-07-19 09:00:00']
|
|
dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00',
|
|
'2013-01-01 15:00:00',
|
|
'2013-02-01 15:00:00', '2013-02-01 15:00:00',
|
|
'2013-02-01 15:00:00']
|
|
df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
|
|
'dt1': dates1, 'dt2': dates2,
|
|
'value1': np.arange(6, dtype='int64'),
|
|
'value2': [1, 2] * 3})
|
|
df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d))
|
|
df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d))
|
|
|
|
result = pivot_table(df, index='label', columns=df['dt1'].dt.hour,
|
|
values='value1')
|
|
|
|
exp_idx = Index(['a', 'b'], name='label')
|
|
expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]},
|
|
index=exp_idx,
|
|
columns=Index([7, 8, 9], name='dt1'))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = pivot_table(df, index=df['dt2'].dt.month,
|
|
columns=df['dt1'].dt.hour,
|
|
values='value1')
|
|
|
|
expected = DataFrame({7: [0, 3], 8: [1, 4], 9: [2, 5]},
|
|
index=Index([1, 2], name='dt2'),
|
|
columns=Index([7, 8, 9], name='dt1'))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = pivot_table(df, index=df['dt2'].dt.year.values,
|
|
columns=[df['dt1'].dt.hour, df['dt2'].dt.month],
|
|
values='value1')
|
|
|
|
exp_col = MultiIndex.from_arrays(
|
|
[[7, 7, 8, 8, 9, 9], [1, 2] * 3], names=['dt1', 'dt2'])
|
|
expected = DataFrame(np.array([[0, 3, 1, 4, 2, 5]], dtype='int64'),
|
|
index=[2013], columns=exp_col)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = pivot_table(df, index=np.array(['X', 'X', 'X',
|
|
'X', 'Y', 'Y']),
|
|
columns=[df['dt1'].dt.hour, df['dt2'].dt.month],
|
|
values='value1')
|
|
expected = DataFrame(np.array([[0, 3, 1, np.nan, 2, np.nan],
|
|
[np.nan, np.nan, np.nan,
|
|
4, np.nan, 5]]),
|
|
index=['X', 'Y'], columns=exp_col)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_daily(self):
|
|
rng = date_range('1/1/2000', '12/31/2004', freq='D')
|
|
ts = Series(np.random.randn(len(rng)), index=rng)
|
|
|
|
annual = pivot_table(DataFrame(ts), index=ts.index.year,
|
|
columns=ts.index.dayofyear)
|
|
annual.columns = annual.columns.droplevel(0)
|
|
|
|
doy = np.asarray(ts.index.dayofyear)
|
|
|
|
for i in range(1, 367):
|
|
subset = ts[doy == i]
|
|
subset.index = subset.index.year
|
|
|
|
result = annual[i].dropna()
|
|
tm.assert_series_equal(result, subset, check_names=False)
|
|
assert result.name == i
|
|
|
|
def test_monthly(self):
|
|
rng = date_range('1/1/2000', '12/31/2004', freq='M')
|
|
ts = Series(np.random.randn(len(rng)), index=rng)
|
|
|
|
annual = pivot_table(pd.DataFrame(ts), index=ts.index.year,
|
|
columns=ts.index.month)
|
|
annual.columns = annual.columns.droplevel(0)
|
|
|
|
month = ts.index.month
|
|
for i in range(1, 13):
|
|
subset = ts[month == i]
|
|
subset.index = subset.index.year
|
|
result = annual[i].dropna()
|
|
tm.assert_series_equal(result, subset, check_names=False)
|
|
assert result.name == i
|
|
|
|
def test_pivot_table_with_iterator_values(self):
|
|
# GH 12017
|
|
aggs = {'D': 'sum', 'E': 'mean'}
|
|
|
|
pivot_values_list = pd.pivot_table(
|
|
self.data, index=['A'], values=list(aggs.keys()), aggfunc=aggs,
|
|
)
|
|
|
|
pivot_values_keys = pd.pivot_table(
|
|
self.data, index=['A'], values=aggs.keys(), aggfunc=aggs,
|
|
)
|
|
tm.assert_frame_equal(pivot_values_keys, pivot_values_list)
|
|
|
|
agg_values_gen = (value for value in aggs.keys())
|
|
pivot_values_gen = pd.pivot_table(
|
|
self.data, index=['A'], values=agg_values_gen, aggfunc=aggs,
|
|
)
|
|
tm.assert_frame_equal(pivot_values_gen, pivot_values_list)
|
|
|
|
def test_pivot_table_margins_name_with_aggfunc_list(self):
|
|
# GH 13354
|
|
margins_name = 'Weekly'
|
|
costs = pd.DataFrame(
|
|
{'item': ['bacon', 'cheese', 'bacon', 'cheese'],
|
|
'cost': [2.5, 4.5, 3.2, 3.3],
|
|
'day': ['M', 'M', 'T', 'T']}
|
|
)
|
|
table = costs.pivot_table(
|
|
index="item", columns="day", margins=True,
|
|
margins_name=margins_name, aggfunc=[np.mean, max]
|
|
)
|
|
ix = pd.Index(
|
|
['bacon', 'cheese', margins_name], dtype='object', name='item'
|
|
)
|
|
tups = [('mean', 'cost', 'M'), ('mean', 'cost', 'T'),
|
|
('mean', 'cost', margins_name), ('max', 'cost', 'M'),
|
|
('max', 'cost', 'T'), ('max', 'cost', margins_name)]
|
|
cols = pd.MultiIndex.from_tuples(tups, names=[None, None, 'day'])
|
|
expected = pd.DataFrame(table.values, index=ix, columns=cols)
|
|
tm.assert_frame_equal(table, expected)
|
|
|
|
@pytest.mark.xfail(reason='GH 17035 (np.mean of ints is casted back to '
|
|
'ints)')
|
|
def test_categorical_margins(self, observed):
|
|
# GH 10989
|
|
df = pd.DataFrame({'x': np.arange(8),
|
|
'y': np.arange(8) // 4,
|
|
'z': np.arange(8) % 2})
|
|
|
|
expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]])
|
|
expected.index = Index([0, 1, 'All'], name='y')
|
|
expected.columns = Index([0, 1, 'All'], name='z')
|
|
|
|
table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True)
|
|
tm.assert_frame_equal(table, expected)
|
|
|
|
@pytest.mark.xfail(reason='GH 17035 (np.mean of ints is casted back to '
|
|
'ints)')
|
|
def test_categorical_margins_category(self, observed):
|
|
df = pd.DataFrame({'x': np.arange(8),
|
|
'y': np.arange(8) // 4,
|
|
'z': np.arange(8) % 2})
|
|
|
|
expected = pd.DataFrame([[1.0, 2.0, 1.5], [5, 6, 5.5], [3, 4, 3.5]])
|
|
expected.index = Index([0, 1, 'All'], name='y')
|
|
expected.columns = Index([0, 1, 'All'], name='z')
|
|
|
|
df.y = df.y.astype('category')
|
|
df.z = df.z.astype('category')
|
|
table = df.pivot_table('x', 'y', 'z', dropna=observed, margins=True)
|
|
tm.assert_frame_equal(table, expected)
|
|
|
|
def test_categorical_aggfunc(self, observed):
|
|
# GH 9534
|
|
df = pd.DataFrame({"C1": ["A", "B", "C", "C"],
|
|
"C2": ["a", "a", "b", "b"],
|
|
"V": [1, 2, 3, 4]})
|
|
df["C1"] = df["C1"].astype("category")
|
|
result = df.pivot_table("V", index="C1", columns="C2",
|
|
dropna=observed, aggfunc="count")
|
|
|
|
expected_index = pd.CategoricalIndex(['A', 'B', 'C'],
|
|
categories=['A', 'B', 'C'],
|
|
ordered=False,
|
|
name='C1')
|
|
expected_columns = pd.Index(['a', 'b'], name='C2')
|
|
expected_data = np.array([[1., np.nan],
|
|
[1., np.nan],
|
|
[np.nan, 2.]])
|
|
expected = pd.DataFrame(expected_data,
|
|
index=expected_index,
|
|
columns=expected_columns)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_categorical_pivot_index_ordering(self, observed):
|
|
# GH 8731
|
|
df = pd.DataFrame({'Sales': [100, 120, 220],
|
|
'Month': ['January', 'January', 'January'],
|
|
'Year': [2013, 2014, 2013]})
|
|
months = ['January', 'February', 'March', 'April', 'May', 'June',
|
|
'July', 'August', 'September', 'October', 'November',
|
|
'December']
|
|
df['Month'] = df['Month'].astype('category').cat.set_categories(months)
|
|
result = df.pivot_table(values='Sales',
|
|
index='Month',
|
|
columns='Year',
|
|
dropna=observed,
|
|
aggfunc='sum')
|
|
expected_columns = pd.Int64Index([2013, 2014], name='Year')
|
|
expected_index = pd.CategoricalIndex(['January'],
|
|
categories=months,
|
|
ordered=False,
|
|
name='Month')
|
|
expected = pd.DataFrame([[320, 120]],
|
|
index=expected_index,
|
|
columns=expected_columns)
|
|
if not observed:
|
|
result = result.dropna().astype(np.int64)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_pivot_table_not_series(self):
|
|
# GH 4386
|
|
# pivot_table always returns a DataFrame
|
|
# when values is not list like and columns is None
|
|
# and aggfunc is not instance of list
|
|
df = DataFrame({'col1': [3, 4, 5],
|
|
'col2': ['C', 'D', 'E'],
|
|
'col3': [1, 3, 9]})
|
|
|
|
result = df.pivot_table('col1', index=['col3', 'col2'], aggfunc=np.sum)
|
|
m = MultiIndex.from_arrays([[1, 3, 9],
|
|
['C', 'D', 'E']],
|
|
names=['col3', 'col2'])
|
|
expected = DataFrame([3, 4, 5],
|
|
index=m, columns=['col1'])
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.pivot_table(
|
|
'col1', index='col3', columns='col2', aggfunc=np.sum
|
|
)
|
|
expected = DataFrame([[3, np.NaN, np.NaN],
|
|
[np.NaN, 4, np.NaN],
|
|
[np.NaN, np.NaN, 5]],
|
|
index=Index([1, 3, 9], name='col3'),
|
|
columns=Index(['C', 'D', 'E'], name='col2'))
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.pivot_table('col1', index='col3', aggfunc=[np.sum])
|
|
m = MultiIndex.from_arrays([['sum'],
|
|
['col1']])
|
|
expected = DataFrame([3, 4, 5],
|
|
index=Index([1, 3, 9], name='col3'),
|
|
columns=m)
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_pivot_margins_name_unicode(self):
|
|
# issue #13292
|
|
greek = u'\u0394\u03bf\u03ba\u03b9\u03bc\u03ae'
|
|
frame = pd.DataFrame({'foo': [1, 2, 3]})
|
|
table = pd.pivot_table(frame, index=['foo'], aggfunc=len, margins=True,
|
|
margins_name=greek)
|
|
index = pd.Index([1, 2, 3, greek], dtype='object', name='foo')
|
|
expected = pd.DataFrame(index=index)
|
|
tm.assert_frame_equal(table, expected)
|
|
|
|
def test_pivot_string_as_func(self):
|
|
# GH #18713
|
|
# for correctness purposes
|
|
data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar',
|
|
'bar', 'bar', 'foo', 'foo', 'foo'],
|
|
'B': ['one', 'one', 'one', 'two', 'one', 'one',
|
|
'one', 'two', 'two', 'two', 'one'],
|
|
'C': range(11)})
|
|
|
|
result = pivot_table(data, index='A', columns='B', aggfunc='sum')
|
|
mi = MultiIndex(levels=[['C'], ['one', 'two']],
|
|
labels=[[0, 0], [0, 1]], names=[None, 'B'])
|
|
expected = DataFrame({('C', 'one'): {'bar': 15, 'foo': 13},
|
|
('C', 'two'): {'bar': 7, 'foo': 20}},
|
|
columns=mi).rename_axis('A')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = pivot_table(data, index='A', columns='B',
|
|
aggfunc=['sum', 'mean'])
|
|
mi = MultiIndex(levels=[['sum', 'mean'], ['C'], ['one', 'two']],
|
|
labels=[[0, 0, 1, 1], [0, 0, 0, 0], [0, 1, 0, 1]],
|
|
names=[None, None, 'B'])
|
|
expected = DataFrame({('mean', 'C', 'one'): {'bar': 5.0, 'foo': 3.25},
|
|
('mean', 'C', 'two'): {'bar': 7.0,
|
|
'foo': 6.666666666666667},
|
|
('sum', 'C', 'one'): {'bar': 15, 'foo': 13},
|
|
('sum', 'C', 'two'): {'bar': 7, 'foo': 20}},
|
|
columns=mi).rename_axis('A')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize('f, f_numpy',
|
|
[('sum', np.sum),
|
|
('mean', np.mean),
|
|
('std', np.std),
|
|
(['sum', 'mean'], [np.sum, np.mean]),
|
|
(['sum', 'std'], [np.sum, np.std]),
|
|
(['std', 'mean'], [np.std, np.mean])])
|
|
def test_pivot_string_func_vs_func(self, f, f_numpy):
|
|
# GH #18713
|
|
# for consistency purposes
|
|
result = pivot_table(self.data, index='A', columns='B', aggfunc=f)
|
|
expected = pivot_table(self.data, index='A', columns='B',
|
|
aggfunc=f_numpy)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
class TestCrosstab(object):
|
|
|
|
def setup_method(self, method):
|
|
df = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
|
|
'bar', 'bar', 'bar', 'bar',
|
|
'foo', 'foo', 'foo'],
|
|
'B': ['one', 'one', 'one', 'two',
|
|
'one', 'one', 'one', 'two',
|
|
'two', 'two', 'one'],
|
|
'C': ['dull', 'dull', 'shiny', 'dull',
|
|
'dull', 'shiny', 'shiny', 'dull',
|
|
'shiny', 'shiny', 'shiny'],
|
|
'D': np.random.randn(11),
|
|
'E': np.random.randn(11),
|
|
'F': np.random.randn(11)})
|
|
|
|
self.df = df.append(df, ignore_index=True)
|
|
|
|
def test_crosstab_single(self):
|
|
df = self.df
|
|
result = crosstab(df['A'], df['C'])
|
|
expected = df.groupby(['A', 'C']).size().unstack()
|
|
tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64))
|
|
|
|
def test_crosstab_multiple(self):
|
|
df = self.df
|
|
|
|
result = crosstab(df['A'], [df['B'], df['C']])
|
|
expected = df.groupby(['A', 'B', 'C']).size()
|
|
expected = expected.unstack(
|
|
'B').unstack('C').fillna(0).astype(np.int64)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = crosstab([df['B'], df['C']], df['A'])
|
|
expected = df.groupby(['B', 'C', 'A']).size()
|
|
expected = expected.unstack('A').fillna(0).astype(np.int64)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_crosstab_ndarray(self):
|
|
a = np.random.randint(0, 5, size=100)
|
|
b = np.random.randint(0, 3, size=100)
|
|
c = np.random.randint(0, 10, size=100)
|
|
|
|
df = DataFrame({'a': a, 'b': b, 'c': c})
|
|
|
|
result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'))
|
|
expected = crosstab(df['a'], [df['b'], df['c']])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = crosstab([b, c], a, colnames=['a'], rownames=('b', 'c'))
|
|
expected = crosstab([df['b'], df['c']], df['a'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# assign arbitrary names
|
|
result = crosstab(self.df['A'].values, self.df['C'].values)
|
|
assert result.index.name == 'row_0'
|
|
assert result.columns.name == 'col_0'
|
|
|
|
def test_crosstab_non_aligned(self):
|
|
# GH 17005
|
|
a = pd.Series([0, 1, 1], index=['a', 'b', 'c'])
|
|
b = pd.Series([3, 4, 3, 4, 3], index=['a', 'b', 'c', 'd', 'f'])
|
|
c = np.array([3, 4, 3])
|
|
|
|
expected = pd.DataFrame([[1, 0], [1, 1]],
|
|
index=Index([0, 1], name='row_0'),
|
|
columns=Index([3, 4], name='col_0'))
|
|
|
|
result = crosstab(a, b)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = crosstab(a, c)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_crosstab_margins(self):
|
|
a = np.random.randint(0, 7, size=100)
|
|
b = np.random.randint(0, 3, size=100)
|
|
c = np.random.randint(0, 5, size=100)
|
|
|
|
df = DataFrame({'a': a, 'b': b, 'c': c})
|
|
|
|
result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
|
|
margins=True)
|
|
|
|
assert result.index.names == ('a',)
|
|
assert result.columns.names == ['b', 'c']
|
|
|
|
all_cols = result['All', '']
|
|
exp_cols = df.groupby(['a']).size().astype('i8')
|
|
# to keep index.name
|
|
exp_margin = Series([len(df)], index=Index(['All'], name='a'))
|
|
exp_cols = exp_cols.append(exp_margin)
|
|
exp_cols.name = ('All', '')
|
|
|
|
tm.assert_series_equal(all_cols, exp_cols)
|
|
|
|
all_rows = result.loc['All']
|
|
exp_rows = df.groupby(['b', 'c']).size().astype('i8')
|
|
exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')]))
|
|
exp_rows.name = 'All'
|
|
|
|
exp_rows = exp_rows.reindex(all_rows.index)
|
|
exp_rows = exp_rows.fillna(0).astype(np.int64)
|
|
tm.assert_series_equal(all_rows, exp_rows)
|
|
|
|
def test_crosstab_margins_set_margin_name(self):
|
|
# GH 15972
|
|
a = np.random.randint(0, 7, size=100)
|
|
b = np.random.randint(0, 3, size=100)
|
|
c = np.random.randint(0, 5, size=100)
|
|
|
|
df = DataFrame({'a': a, 'b': b, 'c': c})
|
|
|
|
result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
|
|
margins=True, margins_name='TOTAL')
|
|
|
|
assert result.index.names == ('a',)
|
|
assert result.columns.names == ['b', 'c']
|
|
|
|
all_cols = result['TOTAL', '']
|
|
exp_cols = df.groupby(['a']).size().astype('i8')
|
|
# to keep index.name
|
|
exp_margin = Series([len(df)], index=Index(['TOTAL'], name='a'))
|
|
exp_cols = exp_cols.append(exp_margin)
|
|
exp_cols.name = ('TOTAL', '')
|
|
|
|
tm.assert_series_equal(all_cols, exp_cols)
|
|
|
|
all_rows = result.loc['TOTAL']
|
|
exp_rows = df.groupby(['b', 'c']).size().astype('i8')
|
|
exp_rows = exp_rows.append(Series([len(df)], index=[('TOTAL', '')]))
|
|
exp_rows.name = 'TOTAL'
|
|
|
|
exp_rows = exp_rows.reindex(all_rows.index)
|
|
exp_rows = exp_rows.fillna(0).astype(np.int64)
|
|
tm.assert_series_equal(all_rows, exp_rows)
|
|
|
|
for margins_name in [666, None, ['a', 'b']]:
|
|
with pytest.raises(ValueError):
|
|
crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'),
|
|
margins=True, margins_name=margins_name)
|
|
|
|
def test_crosstab_pass_values(self):
|
|
a = np.random.randint(0, 7, size=100)
|
|
b = np.random.randint(0, 3, size=100)
|
|
c = np.random.randint(0, 5, size=100)
|
|
values = np.random.randn(100)
|
|
|
|
table = crosstab([a, b], c, values, aggfunc=np.sum,
|
|
rownames=['foo', 'bar'], colnames=['baz'])
|
|
|
|
df = DataFrame({'foo': a, 'bar': b, 'baz': c, 'values': values})
|
|
|
|
expected = df.pivot_table('values', index=['foo', 'bar'],
|
|
columns='baz', aggfunc=np.sum)
|
|
tm.assert_frame_equal(table, expected)
|
|
|
|
def test_crosstab_dropna(self):
|
|
# GH 3820
|
|
a = np.array(['foo', 'foo', 'foo', 'bar',
|
|
'bar', 'foo', 'foo'], dtype=object)
|
|
b = np.array(['one', 'one', 'two', 'one',
|
|
'two', 'two', 'two'], dtype=object)
|
|
c = np.array(['dull', 'dull', 'dull', 'dull',
|
|
'dull', 'shiny', 'shiny'], dtype=object)
|
|
res = pd.crosstab(a, [b, c], rownames=['a'],
|
|
colnames=['b', 'c'], dropna=False)
|
|
m = MultiIndex.from_tuples([('one', 'dull'), ('one', 'shiny'),
|
|
('two', 'dull'), ('two', 'shiny')],
|
|
names=['b', 'c'])
|
|
tm.assert_index_equal(res.columns, m)
|
|
|
|
def test_crosstab_no_overlap(self):
|
|
# GS 10291
|
|
|
|
s1 = pd.Series([1, 2, 3], index=[1, 2, 3])
|
|
s2 = pd.Series([4, 5, 6], index=[4, 5, 6])
|
|
|
|
actual = crosstab(s1, s2)
|
|
expected = pd.DataFrame()
|
|
|
|
tm.assert_frame_equal(actual, expected)
|
|
|
|
def test_margin_dropna(self):
|
|
# GH 12577
|
|
# pivot_table counts null into margin ('All')
|
|
# when margins=true and dropna=true
|
|
|
|
df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan],
|
|
'b': [3, 3, 4, 4, 4, 4]})
|
|
actual = pd.crosstab(df.a, df.b, margins=True, dropna=True)
|
|
expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]])
|
|
expected.index = Index([1.0, 2.0, 'All'], name='a')
|
|
expected.columns = Index([3, 4, 'All'], name='b')
|
|
tm.assert_frame_equal(actual, expected)
|
|
|
|
df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan],
|
|
'b': [3, np.nan, 4, 4, 4, 4]})
|
|
actual = pd.crosstab(df.a, df.b, margins=True, dropna=True)
|
|
expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
|
|
expected.index = Index([1.0, 2.0, 'All'], name='a')
|
|
expected.columns = Index([3.0, 4.0, 'All'], name='b')
|
|
tm.assert_frame_equal(actual, expected)
|
|
|
|
df = DataFrame({'a': [1, np.nan, np.nan, np.nan, np.nan, 2],
|
|
'b': [3, 3, 4, 4, 4, 4]})
|
|
actual = pd.crosstab(df.a, df.b, margins=True, dropna=True)
|
|
expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
|
|
expected.index = Index([1.0, 2.0, 'All'], name='a')
|
|
expected.columns = Index([3, 4, 'All'], name='b')
|
|
tm.assert_frame_equal(actual, expected)
|
|
|
|
# GH 12642
|
|
# _add_margins raises KeyError: Level None not found
|
|
# when margins=True and dropna=False
|
|
df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan],
|
|
'b': [3, 3, 4, 4, 4, 4]})
|
|
actual = pd.crosstab(df.a, df.b, margins=True, dropna=False)
|
|
expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]])
|
|
expected.index = Index([1.0, 2.0, 'All'], name='a')
|
|
expected.columns = Index([3, 4, 'All'], name='b')
|
|
tm.assert_frame_equal(actual, expected)
|
|
|
|
df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan],
|
|
'b': [3, np.nan, 4, 4, 4, 4]})
|
|
actual = pd.crosstab(df.a, df.b, margins=True, dropna=False)
|
|
expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]])
|
|
expected.index = Index([1.0, 2.0, 'All'], name='a')
|
|
expected.columns = Index([3.0, 4.0, 'All'], name='b')
|
|
tm.assert_frame_equal(actual, expected)
|
|
|
|
a = np.array(['foo', 'foo', 'foo', 'bar',
|
|
'bar', 'foo', 'foo'], dtype=object)
|
|
b = np.array(['one', 'one', 'two', 'one',
|
|
'two', np.nan, 'two'], dtype=object)
|
|
c = np.array(['dull', 'dull', 'dull', 'dull',
|
|
'dull', 'shiny', 'shiny'], dtype=object)
|
|
|
|
actual = pd.crosstab(a, [b, c], rownames=['a'],
|
|
colnames=['b', 'c'], margins=True, dropna=False)
|
|
m = MultiIndex.from_arrays([['one', 'one', 'two', 'two', 'All'],
|
|
['dull', 'shiny', 'dull', 'shiny', '']],
|
|
names=['b', 'c'])
|
|
expected = DataFrame([[1, 0, 1, 0, 2], [2, 0, 1, 1, 5],
|
|
[3, 0, 2, 1, 7]], columns=m)
|
|
expected.index = Index(['bar', 'foo', 'All'], name='a')
|
|
tm.assert_frame_equal(actual, expected)
|
|
|
|
actual = pd.crosstab([a, b], c, rownames=['a', 'b'],
|
|
colnames=['c'], margins=True, dropna=False)
|
|
m = MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo', 'All'],
|
|
['one', 'two', 'one', 'two', '']],
|
|
names=['a', 'b'])
|
|
expected = DataFrame([[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2],
|
|
[5, 2, 7]], index=m)
|
|
expected.columns = Index(['dull', 'shiny', 'All'], name='c')
|
|
tm.assert_frame_equal(actual, expected)
|
|
|
|
actual = pd.crosstab([a, b], c, rownames=['a', 'b'],
|
|
colnames=['c'], margins=True, dropna=True)
|
|
m = MultiIndex.from_arrays([['bar', 'bar', 'foo', 'foo', 'All'],
|
|
['one', 'two', 'one', 'two', '']],
|
|
names=['a', 'b'])
|
|
expected = DataFrame([[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2],
|
|
[5, 1, 6]], index=m)
|
|
expected.columns = Index(['dull', 'shiny', 'All'], name='c')
|
|
tm.assert_frame_equal(actual, expected)
|
|
|
|
def test_crosstab_normalize(self):
|
|
# Issue 12578
|
|
df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
|
|
'c': [1, 1, np.nan, 1, 1]})
|
|
|
|
rindex = pd.Index([1, 2], name='a')
|
|
cindex = pd.Index([3, 4], name='b')
|
|
full_normal = pd.DataFrame([[0.2, 0], [0.2, 0.6]],
|
|
index=rindex, columns=cindex)
|
|
row_normal = pd.DataFrame([[1.0, 0], [0.25, 0.75]],
|
|
index=rindex, columns=cindex)
|
|
col_normal = pd.DataFrame([[0.5, 0], [0.5, 1.0]],
|
|
index=rindex, columns=cindex)
|
|
|
|
# Check all normalize args
|
|
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='all'),
|
|
full_normal)
|
|
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True),
|
|
full_normal)
|
|
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index'),
|
|
row_normal)
|
|
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns'),
|
|
col_normal)
|
|
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=1),
|
|
pd.crosstab(df.a, df.b, normalize='columns'))
|
|
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=0),
|
|
pd.crosstab(df.a, df.b, normalize='index'))
|
|
|
|
row_normal_margins = pd.DataFrame([[1.0, 0],
|
|
[0.25, 0.75],
|
|
[0.4, 0.6]],
|
|
index=pd.Index([1, 2, 'All'],
|
|
name='a',
|
|
dtype='object'),
|
|
columns=pd.Index([3, 4], name='b',
|
|
dtype='object'))
|
|
col_normal_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
|
|
index=pd.Index([1, 2], name='a',
|
|
dtype='object'),
|
|
columns=pd.Index([3, 4, 'All'],
|
|
name='b',
|
|
dtype='object'))
|
|
|
|
all_normal_margins = pd.DataFrame([[0.2, 0, 0.2],
|
|
[0.2, 0.6, 0.8],
|
|
[0.4, 0.6, 1]],
|
|
index=pd.Index([1, 2, 'All'],
|
|
name='a',
|
|
dtype='object'),
|
|
columns=pd.Index([3, 4, 'All'],
|
|
name='b',
|
|
dtype='object'))
|
|
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index',
|
|
margins=True), row_normal_margins)
|
|
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns',
|
|
margins=True), col_normal_margins)
|
|
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True,
|
|
margins=True), all_normal_margins)
|
|
|
|
# Test arrays
|
|
pd.crosstab([np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])],
|
|
np.array([1, 2, 1, 2]))
|
|
|
|
# Test with aggfunc
|
|
norm_counts = pd.DataFrame([[0.25, 0, 0.25],
|
|
[0.25, 0.5, 0.75],
|
|
[0.5, 0.5, 1]],
|
|
index=pd.Index([1, 2, 'All'],
|
|
name='a',
|
|
dtype='object'),
|
|
columns=pd.Index([3, 4, 'All'],
|
|
name='b'))
|
|
test_case = pd.crosstab(df.a, df.b, df.c, aggfunc='count',
|
|
normalize='all',
|
|
margins=True)
|
|
tm.assert_frame_equal(test_case, norm_counts)
|
|
|
|
df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
|
|
'c': [0, 4, np.nan, 3, 3]})
|
|
|
|
norm_sum = pd.DataFrame([[0, 0, 0.],
|
|
[0.4, 0.6, 1],
|
|
[0.4, 0.6, 1]],
|
|
index=pd.Index([1, 2, 'All'],
|
|
name='a',
|
|
dtype='object'),
|
|
columns=pd.Index([3, 4, 'All'],
|
|
name='b',
|
|
dtype='object'))
|
|
test_case = pd.crosstab(df.a, df.b, df.c, aggfunc=np.sum,
|
|
normalize='all',
|
|
margins=True)
|
|
tm.assert_frame_equal(test_case, norm_sum)
|
|
|
|
def test_crosstab_with_empties(self):
|
|
# Check handling of empties
|
|
df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
|
|
'c': [np.nan, np.nan, np.nan, np.nan, np.nan]})
|
|
|
|
empty = pd.DataFrame([[0.0, 0.0], [0.0, 0.0]],
|
|
index=pd.Index([1, 2],
|
|
name='a',
|
|
dtype='int64'),
|
|
columns=pd.Index([3, 4], name='b'))
|
|
|
|
for i in [True, 'index', 'columns']:
|
|
calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count',
|
|
normalize=i)
|
|
tm.assert_frame_equal(empty, calculated)
|
|
|
|
nans = pd.DataFrame([[0.0, np.nan], [0.0, 0.0]],
|
|
index=pd.Index([1, 2],
|
|
name='a',
|
|
dtype='int64'),
|
|
columns=pd.Index([3, 4], name='b'))
|
|
|
|
calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count',
|
|
normalize=False)
|
|
tm.assert_frame_equal(nans, calculated)
|
|
|
|
def test_crosstab_errors(self):
|
|
# Issue 12578
|
|
|
|
df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
|
|
'c': [1, 1, np.nan, 1, 1]})
|
|
|
|
error = 'values cannot be used without an aggfunc.'
|
|
with tm.assert_raises_regex(ValueError, error):
|
|
pd.crosstab(df.a, df.b, values=df.c)
|
|
|
|
error = 'aggfunc cannot be used without values'
|
|
with tm.assert_raises_regex(ValueError, error):
|
|
pd.crosstab(df.a, df.b, aggfunc=np.mean)
|
|
|
|
error = 'Not a valid normalize argument'
|
|
with tm.assert_raises_regex(ValueError, error):
|
|
pd.crosstab(df.a, df.b, normalize='42')
|
|
|
|
with tm.assert_raises_regex(ValueError, error):
|
|
pd.crosstab(df.a, df.b, normalize=42)
|
|
|
|
error = 'Not a valid margins argument'
|
|
with tm.assert_raises_regex(ValueError, error):
|
|
pd.crosstab(df.a, df.b, normalize='all', margins=42)
|
|
|
|
def test_crosstab_with_categorial_columns(self):
|
|
# GH 8860
|
|
df = pd.DataFrame({'MAKE': ['Honda', 'Acura', 'Tesla',
|
|
'Honda', 'Honda', 'Acura'],
|
|
'MODEL': ['Sedan', 'Sedan', 'Electric',
|
|
'Pickup', 'Sedan', 'Sedan']})
|
|
categories = ['Sedan', 'Electric', 'Pickup']
|
|
df['MODEL'] = (df['MODEL'].astype('category')
|
|
.cat.set_categories(categories))
|
|
result = pd.crosstab(df['MAKE'], df['MODEL'])
|
|
|
|
expected_index = pd.Index(['Acura', 'Honda', 'Tesla'], name='MAKE')
|
|
expected_columns = pd.CategoricalIndex(categories,
|
|
categories=categories,
|
|
ordered=False,
|
|
name='MODEL')
|
|
expected_data = [[2, 0, 0], [2, 0, 1], [0, 1, 0]]
|
|
expected = pd.DataFrame(expected_data,
|
|
index=expected_index,
|
|
columns=expected_columns)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_crosstab_with_numpy_size(self):
|
|
# GH 4003
|
|
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6,
|
|
'B': ['A', 'B', 'C'] * 8,
|
|
'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4,
|
|
'D': np.random.randn(24),
|
|
'E': np.random.randn(24)})
|
|
result = pd.crosstab(index=[df['A'], df['B']],
|
|
columns=[df['C']],
|
|
margins=True,
|
|
aggfunc=np.size,
|
|
values=df['D'])
|
|
expected_index = pd.MultiIndex(levels=[['All', 'one', 'three', 'two'],
|
|
['', 'A', 'B', 'C']],
|
|
labels=[[1, 1, 1, 2, 2, 2, 3, 3, 3, 0],
|
|
[1, 2, 3, 1, 2, 3, 1, 2, 3, 0]],
|
|
names=['A', 'B'])
|
|
expected_column = pd.Index(['bar', 'foo', 'All'],
|
|
dtype='object',
|
|
name='C')
|
|
expected_data = np.array([[2., 2., 4.],
|
|
[2., 2., 4.],
|
|
[2., 2., 4.],
|
|
[2., np.nan, 2.],
|
|
[np.nan, 2., 2.],
|
|
[2., np.nan, 2.],
|
|
[np.nan, 2., 2.],
|
|
[2., np.nan, 2.],
|
|
[np.nan, 2., 2.],
|
|
[12., 12., 24.]])
|
|
expected = pd.DataFrame(expected_data,
|
|
index=expected_index,
|
|
columns=expected_column)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_crosstab_dup_index_names(self):
|
|
# GH 13279
|
|
s = pd.Series(range(3), name='foo')
|
|
|
|
result = pd.crosstab(s, s)
|
|
expected_index = pd.Index(range(3), name='foo')
|
|
expected = pd.DataFrame(np.eye(3, dtype=np.int64),
|
|
index=expected_index,
|
|
columns=expected_index)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize("names", [['a', ('b', 'c')],
|
|
[('a', 'b'), 'c']])
|
|
def test_crosstab_tuple_name(self, names):
|
|
s1 = pd.Series(range(3), name=names[0])
|
|
s2 = pd.Series(range(1, 4), name=names[1])
|
|
|
|
mi = pd.MultiIndex.from_arrays([range(3), range(1, 4)], names=names)
|
|
expected = pd.Series(1, index=mi).unstack(1, fill_value=0)
|
|
|
|
result = pd.crosstab(s1, s2)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_crosstab_unsorted_order(self):
|
|
df = pd.DataFrame({"b": [3, 1, 2], 'a': [5, 4, 6]},
|
|
index=['C', 'A', 'B'])
|
|
result = pd.crosstab(df.index, [df.b, df.a])
|
|
e_idx = pd.Index(['A', 'B', 'C'], name='row_0')
|
|
e_columns = pd.MultiIndex.from_tuples([(1, 4), (2, 6), (3, 5)],
|
|
names=['b', 'a'])
|
|
expected = pd.DataFrame([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
|
|
index=e_idx,
|
|
columns=e_columns)
|
|
tm.assert_frame_equal(result, expected)
|