# Source: laywerrobot/lib/python3.6/site-packages/pandas/tests/frame/test_reshape.py
# (913 lines, 36 KiB, Python; history entry dated 2020-08-27 21:55:39 +02:00)

# -*- coding: utf-8 -*-
from __future__ import print_function
from warnings import catch_warnings
from datetime import datetime
import itertools
import pytest
from numpy.random import randn
from numpy import nan
import numpy as np
from pandas.compat import u
from pandas import (DataFrame, Index, Series, MultiIndex, date_range,
Timedelta, Period)
import pandas as pd
from pandas.util.testing import assert_series_equal, assert_frame_equal
import pandas.util.testing as tm
from pandas.tests.frame.common import TestData
# Test suite for DataFrame reshaping: pivot, stack and unstack.
# Some tests read ``self.frame``, presumably supplied by the ``TestData`` base.
class TestDataFrameReshape(TestData):
def test_pivot(self):
    """Pivot long-format data to wide format and check name tracking.

    Three scenarios are exercised: an explicit ``values`` column,
    ``values`` omitted (the columns become a two-level index), and a
    frame derived from a Panel that is pivoted on 'major'/'minor'.
    """
    data = {
        'index': ['A', 'B', 'C', 'C', 'B', 'A'],
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.]
    }

    frame = DataFrame(data)
    pivoted = frame.pivot(
        index='index', columns='columns', values='values')

    expected = DataFrame({
        'One': {'A': 1., 'B': 2., 'C': 3.},
        'Two': {'A': 1., 'B': 2., 'C': 3.}
    })

    expected.index.name, expected.columns.name = 'index', 'columns'
    tm.assert_frame_equal(pivoted, expected)

    # name tracking
    assert pivoted.index.name == 'index'
    assert pivoted.columns.name == 'columns'

    # don't specify values
    pivoted = frame.pivot(index='index', columns='columns')
    assert pivoted.index.name == 'index'
    assert pivoted.columns.names == (None, 'columns')

    # Warnings raised inside this block are recorded instead of shown.
    with catch_warnings(record=True):
        # pivot multiple columns
        wp = tm.makePanel()
        lp = wp.to_frame()
        df = lp.reset_index()
        # Keyword arguments, like the calls above, so the call does not
        # depend on the positional order of ``pivot``'s parameters.
        tm.assert_frame_equal(df.pivot(index='major', columns='minor'),
                              lp.unstack())
def test_pivot_duplicates(self):
    """Pivoting must raise when an (index, column) pair occurs twice."""
    data = DataFrame({'a': ['bar', 'bar', 'foo', 'foo', 'foo'],
                      'b': ['one', 'two', 'one', 'one', 'two'],
                      'c': [1., 2., 3., 3., 4.]})
    # ('foo', 'one') appears in two rows, so the reshape is ambiguous.
    with pytest.raises(ValueError, match='duplicate entries'):
        data.pivot(index='a', columns='b', values='c')
def test_pivot_empty(self):
    """Pivoting a frame with columns but no rows gives an empty frame."""
    df = DataFrame({}, columns=['a', 'b', 'c'])
    # Keyword arguments keep the call independent of parameter order.
    result = df.pivot(index='a', columns='b', values='c')
    expected = DataFrame({})
    # check_names=False: axis names are left out of the comparison.
    tm.assert_frame_equal(result, expected, check_names=False)
def test_pivot_integer_bug(self):
df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")])
result = df.pivot(index=1, columns=0, values=2)
repr(result)
tm.assert_index_equal(result.columns, Index(['A', 'B'], name=0))
def test_pivot_index_none(self):
# gh-3962
data = {
'index': ['A', 'B', 'C', 'C', 'B', 'A'],
'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
'values': [1., 2., 3., 3., 2., 1.]
}
frame = DataFrame(data).set_index('index')
result = frame.pivot(columns='columns', values='values')
expected = DataFrame({
'One': {'A': 1., 'B': 2., 'C': 3.},
'Two': {'A': 1., 'B': 2., 'C': 3.}
})
expected.index.name, expected.columns.name = 'index', 'columns'
assert_frame_equal(result, expected)
# omit values
result = frame.pivot(columns='columns')
expected.columns = pd.MultiIndex.from_tuples([('values', 'One'),
('values', 'Two')],
names=[None, 'columns'])
expected.index.name = 'index'
tm.assert_frame_equal(result, expected, check_names=False)
assert result.index.name == 'index'
assert result.columns.names == (None, 'columns')
expected.columns = expected.columns.droplevel(0)
result = frame.pivot(columns='columns', values='values')
expected.columns.name = 'columns'
tm.assert_frame_equal(result, expected)
def test_stack_unstack(self):
df = self.frame.copy()
df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)
stacked = df.stack()
stacked_df = DataFrame({'foo': stacked, 'bar': stacked})
unstacked = stacked.unstack()
unstacked_df = stacked_df.unstack()
assert_frame_equal(unstacked, df)
assert_frame_equal(unstacked_df['bar'], df)
unstacked_cols = stacked.unstack(0)
unstacked_cols_df = stacked_df.unstack(0)
assert_frame_equal(unstacked_cols.T, df)
assert_frame_equal(unstacked_cols_df['bar'].T, df)
def test_stack_mixed_level(self):
# GH 18310
levels = [range(3), [3, 'a', 'b'], [1, 2]]
# flat columns:
df = DataFrame(1, index=levels[0], columns=levels[1])
result = df.stack()
expected = Series(1, index=MultiIndex.from_product(levels[:2]))
assert_series_equal(result, expected)
# MultiIndex columns:
df = DataFrame(1, index=levels[0],
columns=MultiIndex.from_product(levels[1:]))
result = df.stack(1)
expected = DataFrame(1, index=MultiIndex.from_product([levels[0],
levels[2]]),
columns=levels[1])
assert_frame_equal(result, expected)
# as above, but used labels in level are actually of homogeneous type
result = df[['a', 'b']].stack(1)
expected = expected[['a', 'b']]
assert_frame_equal(result, expected)
def test_unstack_fill(self):
# GH #9746: fill_value keyword argument for Series
# and DataFrame unstack
# From a series
data = Series([1, 2, 4, 5], dtype=np.int16)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = data.unstack(fill_value=-1)
expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
index=['x', 'y', 'z'], dtype=np.int16)
assert_frame_equal(result, expected)
# From a series with incorrect data type for fill_value
result = data.unstack(fill_value=0.5)
expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
index=['x', 'y', 'z'], dtype=np.float)
assert_frame_equal(result, expected)
# GH #13971: fill_value when unstacking multiple levels:
df = DataFrame({'x': ['a', 'a', 'b'],
'y': ['j', 'k', 'j'],
'z': [0, 1, 2],
'w': [0, 1, 2]}).set_index(['x', 'y', 'z'])
unstacked = df.unstack(['x', 'y'], fill_value=0)
key = ('w', 'b', 'j')
expected = unstacked[key]
result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
assert_series_equal(result, expected)
stacked = unstacked.stack(['x', 'y'])
stacked.index = stacked.index.reorder_levels(df.index.names)
# Workaround for GH #17886 (unnecessarily casts to float):
stacked = stacked.astype(np.int64)
result = stacked.loc[df.index]
assert_frame_equal(result, df)
# From a series
s = df['w']
result = s.unstack(['x', 'y'], fill_value=0)
expected = unstacked['w']
assert_frame_equal(result, expected)
def test_unstack_fill_frame(self):
# From a dataframe
rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
df.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = df.unstack(fill_value=-1)
rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
expected.columns = MultiIndex.from_tuples(
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
assert_frame_equal(result, expected)
# From a mixed type dataframe
df['A'] = df['A'].astype(np.int16)
df['B'] = df['B'].astype(np.float64)
result = df.unstack(fill_value=-1)
expected['A'] = expected['A'].astype(np.int16)
expected['B'] = expected['B'].astype(np.float64)
assert_frame_equal(result, expected)
# From a dataframe with incorrect data type for fill_value
result = df.unstack(fill_value=0.5)
rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
expected.columns = MultiIndex.from_tuples(
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
assert_frame_equal(result, expected)
def test_unstack_fill_frame_datetime(self):
# Test unstacking with date times
dv = pd.date_range('2012-01-01', periods=4).values
data = Series(dv)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = data.unstack()
expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]],
'b': [dv[1], dv[2], pd.NaT]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
result = data.unstack(fill_value=dv[0])
expected = DataFrame({'a': [dv[0], dv[0], dv[3]],
'b': [dv[1], dv[2], dv[0]]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
def test_unstack_fill_frame_timedelta(self):
# Test unstacking with time deltas
td = [Timedelta(days=i) for i in range(4)]
data = Series(td)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = data.unstack()
expected = DataFrame({'a': [td[0], pd.NaT, td[3]],
'b': [td[1], td[2], pd.NaT]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
result = data.unstack(fill_value=td[1])
expected = DataFrame({'a': [td[0], td[1], td[3]],
'b': [td[1], td[2], td[1]]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
def test_unstack_fill_frame_period(self):
# Test unstacking with period
periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'),
Period('2012-04')]
data = Series(periods)
data.index = MultiIndex.from_tuples(
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
result = data.unstack()
expected = DataFrame({'a': [periods[0], None, periods[3]],
'b': [periods[1], periods[2], None]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
result = data.unstack(fill_value=periods[1])
expected = DataFrame({'a': [periods[0], periods[1], periods[3]],
'b': [periods[1], periods[2], periods[1]]},
index=['x', 'y', 'z'])
assert_frame_equal(result, expected)
def test_unstack_fill_frame_categorical(self):
    """Unstack categorical values with and without ``fill_value``."""
    # Test unstacking with categorical
    categories = list('abc')
    keys = [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]
    series = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
    series.index = pd.MultiIndex.from_tuples(keys)
    row_labels = list('xyz')

    def as_frame(col_a, col_b):
        # Build the expected frame from two strings of single-letter values.
        return DataFrame(
            {'a': pd.Categorical(list(col_a), categories=categories),
             'b': pd.Categorical(list(col_b), categories=categories)},
            index=row_labels)

    # By default missing values will be NaN
    # ('x' is not a category, so it stands in for the missing cells).
    with_holes = as_frame('axa', 'bcx')
    assert_frame_equal(series.unstack(), with_holes)

    # Fill with non-category results in NaN entries similar to above
    assert_frame_equal(series.unstack(fill_value='d'), with_holes)

    # Fill with category value replaces missing values as expected
    assert_frame_equal(series.unstack(fill_value='c'),
                       as_frame('aca', 'bcc'))
def test_unstack_preserve_dtypes(self):
# Checks fix for #11847
df = pd.DataFrame(dict(state=['IL', 'MI', 'NC'],
index=['a', 'b', 'c'],
some_categories=pd.Series(['a', 'b', 'c']
).astype('category'),
A=np.random.rand(3),
B=1,
C='foo',
D=pd.Timestamp('20010102'),
E=pd.Series([1.0, 50.0, 100.0]
).astype('float32'),
F=pd.Series([3.0, 4.0, 5.0]).astype('float64'),
G=False,
H=pd.Series([1, 200, 923442], dtype='int8')))
def unstack_and_compare(df, column_name):
unstacked1 = df.unstack([column_name])
unstacked2 = df.unstack(column_name)
assert_frame_equal(unstacked1, unstacked2)
df1 = df.set_index(['state', 'index'])
unstack_and_compare(df1, 'index')
df1 = df.set_index(['state', 'some_categories'])
unstack_and_compare(df1, 'some_categories')
df1 = df.set_index(['F', 'C'])
unstack_and_compare(df1, 'F')
df1 = df.set_index(['G', 'B', 'state'])
unstack_and_compare(df1, 'B')
df1 = df.set_index(['E', 'A'])
unstack_and_compare(df1, 'E')
df1 = df.set_index(['state', 'index'])
s = df1['A']
unstack_and_compare(s, 'index')
def test_stack_ints(self):
columns = MultiIndex.from_tuples(list(itertools.product(range(3),
repeat=3)))
df = DataFrame(np.random.randn(30, 27), columns=columns)
assert_frame_equal(df.stack(level=[1, 2]),
df.stack(level=1).stack(level=1))
assert_frame_equal(df.stack(level=[-2, -1]),
df.stack(level=1).stack(level=1))
df_named = df.copy()
df_named.columns.set_names(range(3), inplace=True)
assert_frame_equal(df_named.stack(level=[1, 2]),
df_named.stack(level=1).stack(level=1))
def test_stack_mixed_levels(self):
    """Stack by a mix of string and integer level names."""
    level_names = ['exp', 'animal', 'hair_length']
    columns = MultiIndex.from_tuples(
        [('A', 'cat', 'long'), ('B', 'cat', 'long'),
         ('A', 'dog', 'short'), ('B', 'dog', 'short')],
        names=level_names
    )
    base = DataFrame(randn(4, 4), columns=columns)

    by_animal_hair = base.stack(level=['animal', 'hair_length'])
    by_exp_hair = base.stack(level=['exp', 'hair_length'])

    # GH #8584: Need to check that stacking works when a number
    # is passed that is both a level name and in the range of
    # the level numbers
    numbered = base.copy()
    numbered.columns.names = ['exp', 'animal', 1]
    assert_frame_equal(numbered.stack(level=['animal', 1]),
                       by_animal_hair, check_names=False)
    assert_frame_equal(numbered.stack(level=['exp', 1]),
                       by_exp_hair, check_names=False)

    # When mixed types are passed and the ints are not level
    # names, raise
    with pytest.raises(ValueError):
        numbered.stack(level=['animal', 0])

    # GH #8584: Having 0 in the level names could raise a
    # strange error about lexsort depth
    zero_named = base.copy()
    zero_named.columns.names = ['exp', 'animal', 0]
    assert_frame_equal(zero_named.stack(level=['animal', 0]),
                       by_animal_hair, check_names=False)
def test_stack_int_level_names(self):
columns = MultiIndex.from_tuples(
[('A', 'cat', 'long'), ('B', 'cat', 'long'),
('A', 'dog', 'short'), ('B', 'dog', 'short')],
names=['exp', 'animal', 'hair_length']
)
df = DataFrame(randn(4, 4), columns=columns)
exp_animal_stacked = df.stack(level=['exp', 'animal'])
animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
exp_hair_stacked = df.stack(level=['exp', 'hair_length'])
df2 = df.copy()
df2.columns.names = [0, 1, 2]
assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked,
check_names=False)
assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked,
check_names=False)
assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked,
check_names=False)
# Out-of-order int column names
df3 = df.copy()
df3.columns.names = [2, 0, 1]
assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked,
check_names=False)
assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked,
check_names=False)
assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked,
check_names=False)
def test_unstack_bool(self):
df = DataFrame([False, False],
index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]),
columns=['col'])
rs = df.unstack()
xp = DataFrame(np.array([[False, np.nan], [np.nan, False]],
dtype=object),
index=['a', 'b'],
columns=MultiIndex.from_arrays([['col', 'col'],
['c', 'l']]))
assert_frame_equal(rs, xp)
def test_unstack_level_binding(self):
    """Unstack two levels, then stack level 0 back."""
    # GH9856
    # Levels and integer codes are passed positionally.
    source_index = pd.MultiIndex(
        [[u('foo'), u('bar')], [u('one'), u('two')], [u('a'), u('b')]],
        [[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
        names=[u('first'), u('second'), u('third')])
    series = pd.Series(0, index=source_index)

    result = series.unstack([1, 2]).stack(0)

    expected_index = pd.MultiIndex(
        [['foo', 'bar'], ['one', 'two']],
        [[0, 0, 1, 1], [0, 1, 0, 1]],
        names=['first', 'second'])
    cells = np.array([[np.nan, 0],
                      [0, np.nan],
                      [np.nan, 0],
                      [0, np.nan]],
                     dtype=np.float64)
    expected = pd.DataFrame(cells,
                            index=expected_index,
                            columns=pd.Index(['a', 'b'], name='third'))
    assert_frame_equal(result, expected)
def test_unstack_to_series(self):
# check reversibility
data = self.frame.unstack()
assert isinstance(data, Series)
undo = data.unstack().T
assert_frame_equal(undo, self.frame)
# check NA handling
data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]})
data.index = Index(['a', 'b', 'c'])
result = data.unstack()
midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']],
labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx)
assert_series_equal(result, expected)
# check composability of unstack
old_data = data.copy()
for _ in range(4):
data = data.unstack()
assert_frame_equal(old_data, data)
def test_unstack_dtypes(self):
    """Check per-dtype column counts after unstacking."""
    # GH 2929
    records = [[1, 1, 3, 4],
               [1, 2, 3, 4],
               [2, 1, 3, 4],
               [2, 2, 3, 4]]
    base = DataFrame(records, columns=list('ABCD'))
    assert_series_equal(base.get_dtype_counts(), Series({'int64': 4}))

    # single dtype
    indexed = base.set_index(['A', 'B'])
    counts = indexed.unstack('B').get_dtype_counts()
    assert_series_equal(counts, Series({'int64': 4}))

    # mixed
    indexed = base.set_index(['A', 'B'])
    indexed['C'] = 3.
    counts = indexed.unstack('B').get_dtype_counts()
    assert_series_equal(counts, Series({'int64': 2, 'float64': 2}))

    indexed['D'] = 'foo'
    counts = indexed.unstack('B').get_dtype_counts()
    assert_series_equal(counts, Series({'float64': 2, 'object': 2}))

    # GH7405
    value_pairs = (
        (np.zeros(5), np.zeros(5)),
        (np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')),
    )
    for c_values, d_values in value_pairs:
        frame = DataFrame({'A': ['a'] * 5, 'C': c_values, 'D': d_values,
                           'B': pd.date_range('2012-01-01', periods=5)})

        # ``right`` is taken before 'D' is cast, so the cast is applied
        # to it separately after unstacking.
        right = frame.iloc[:3].copy(deep=True)

        frame = frame.set_index(['A', 'B'])
        frame['D'] = frame['D'].astype('int64')

        left = frame.iloc[:3].unstack(0)
        right = right.set_index(['A', 'B']).unstack(0)
        right[('D', 'a')] = right[('D', 'a')].astype('int64')

        assert left.shape == (3, 2)
        tm.assert_frame_equal(left, right)
def test_unstack_non_unique_index_names(self):
    """A level name shared by two levels is rejected with ValueError."""
    duplicated_names = ['c1', 'c1']
    index = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
                                   names=duplicated_names)
    frame = DataFrame([1, 2], index=index)

    # Row side: unstack by the ambiguous name.
    with pytest.raises(ValueError):
        frame.unstack('c1')

    # Column side: the same index, transposed, through stack.
    transposed = frame.T
    with pytest.raises(ValueError):
        transposed.stack('c1')
def test_unstack_unused_levels(self):
# GH 17845: unused labels in index make unstack() cast int to float
idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]
df = pd.DataFrame([[1, 0]] * 3, index=idx)
result = df.unstack()
exp_col = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']])
expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'],
columns=exp_col)
tm.assert_frame_equal(result, expected)
assert((result.columns.levels[1] == idx.levels[1]).all())
# Unused items on both levels
levels = [[0, 1, 7], [0, 1, 2, 3]]
labels = [[0, 0, 1, 1], [0, 2, 0, 2]]
idx = pd.MultiIndex(levels, labels)
block = np.arange(4).reshape(2, 2)
df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx)
result = df.unstack()
expected = pd.DataFrame(np.concatenate([block * 2, block * 2 + 1],
axis=1),
columns=idx)
tm.assert_frame_equal(result, expected)
assert((result.columns.levels[1] == idx.levels[1]).all())
# With mixed dtype and NaN
levels = [['a', 2, 'c'], [1, 3, 5, 7]]
labels = [[0, -1, 1, 1], [0, 2, -1, 2]]
idx = pd.MultiIndex(levels, labels)
data = np.arange(8)
df = pd.DataFrame(data.reshape(4, 2), index=idx)
cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11],
[np.nan, 'a', 2], [np.nan, 5, 1]),
(1, [8, 11, 1, 4, 12, 15, 13, 16],
[np.nan, 5, 1], [np.nan, 'a', 2]))
for level, idces, col_level, idx_level in cases:
result = df.unstack(level=level)
exp_data = np.zeros(18) * np.nan
exp_data[idces] = data
cols = pd.MultiIndex.from_product([[0, 1], col_level])
expected = pd.DataFrame(exp_data.reshape(3, 6),
index=idx_level, columns=cols)
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("cols", [['A', 'C'], slice(None)])
def test_unstack_unused_level(self, cols):
    """Unstack a row selection taken from a three-level index."""
    # GH 18562 : unused labels on the unstacked level
    source = pd.DataFrame([[2010, 'a', 'I'],
                           [2011, 'b', 'II']],
                          columns=['A', 'B', 'C'])
    # drop=False keeps A, B and C as ordinary columns as well.
    indexed = source.set_index(['A', 'B', 'C'], drop=False)

    # Keep only the rows whose third level equals 'I'.
    row_key = (slice(None), slice(None), 'I')
    result = indexed.loc[row_key, cols].unstack()

    expected = indexed.iloc[[0]][cols]
    expected.columns = MultiIndex.from_product(
        [expected.columns, ['I']], names=[None, 'C'])
    expected.index = expected.index.droplevel('C')
    tm.assert_frame_equal(result, expected)
def test_unstack_nan_index(self):  # GH7466
    """Unstack frames whose index levels contain NaN.

    The first two sections encode each row's index values in a string
    column and check, after unstacking, that every non-null cell sits
    under the matching row and column labels.  The remaining sections
    compare against hand-built frames for GH7403, GH7401, GH4862 and
    GH9497.
    """
    nan = np.nan

    def cast(val):
        # NaN (the only value unequal to itself) is rendered as a single
        # blank; the format width of 1 pads the empty string.
        return '{0:1}'.format('' if val != val else val)

    def mk_list(a):
        # Index labels are tuples for a MultiIndex and scalars otherwise.
        return list(a) if isinstance(a, tuple) else [a]

    def verify(df):
        # Every non-null cell holds '.'-joined labels; they must equal
        # the labels of the row and column the cell is found at.
        rows, cols = df.notna().values.nonzero()
        for i, j in zip(rows, cols):
            left = sorted(df.iloc[i, j].split('.'))
            right = mk_list(df.index[i]) + mk_list(df.columns[j])
            right = sorted(list(map(cast, right)))
            assert left == right

    df = DataFrame({'jim': ['a', 'b', nan, 'd'],
                    'joe': ['w', 'x', 'y', 'z'],
                    'jolie': ['a.w', 'b.x', ' .y', 'd.z']})

    left = df.set_index(['jim', 'joe']).unstack()['jolie']
    right = df.set_index(['joe', 'jim']).unstack()['jolie'].T
    assert_frame_equal(left, right)

    for idx in itertools.permutations(df.columns[:2]):
        mi = df.set_index(list(idx))
        for lev in range(2):
            udf = mi.unstack(level=lev)
            assert udf.notna().values.sum() == len(df)
            verify(udf['jolie'])

    df = DataFrame({'1st': ['d'] * 3 + [nan] * 5 + ['a'] * 2 +
                    ['c'] * 3 + ['e'] * 2 + ['b'] * 5,
                    '2nd': ['y'] * 2 + ['w'] * 3 + [nan] * 3 +
                    ['z'] * 4 + [nan] * 3 + ['x'] * 3 + [nan] * 2,
                    '3rd': [67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59,
                            50, 62, 59, 76, 52, 14, 53, 60, 51]})

    # Both encodings are computed before either column is added, so
    # each one only sees the three original columns.
    forward = df.apply(lambda r: '.'.join(map(cast, r)), axis=1)
    backward = df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])),
                        axis=1)
    df['4th'], df['5th'] = forward, backward

    for idx in itertools.permutations(['1st', '2nd', '3rd']):
        mi = df.set_index(list(idx))
        for lev in range(3):
            udf = mi.unstack(level=lev)
            assert udf.notna().values.sum() == 2 * len(df)
            for col in ['4th', '5th']:
                verify(udf[col])

    # MultiIndex objects below receive levels and codes positionally so
    # the calls do not depend on the keyword name of the second argument.

    # GH7403
    df = pd.DataFrame(
        {'A': list('aaaabbbb'), 'B': range(8), 'C': range(8)})
    df.iloc[3, 1] = nan
    left = df.set_index(['A', 'B']).unstack(0)

    vals = [[3, 0, 1, 2, nan, nan, nan, nan],
            [nan, nan, nan, nan, 4, 5, 6, 7]]
    vals = list(map(list, zip(*vals)))
    idx = Index([nan, 0, 1, 2, 4, 5, 6, 7], name='B')
    cols = MultiIndex([['C'], ['a', 'b']],
                      [[0, 0], [0, 1]],
                      names=[None, 'A'])

    right = DataFrame(vals, columns=cols, index=idx)
    assert_frame_equal(left, right)

    df = DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
                    'C': range(8)})
    df.iloc[2, 1] = nan
    left = df.set_index(['A', 'B']).unstack(0)

    vals = [[2, nan], [0, 4], [1, 5], [nan, 6], [3, 7]]
    cols = MultiIndex([['C'], ['a', 'b']],
                      [[0, 0], [0, 1]],
                      names=[None, 'A'])
    idx = Index([nan, 0, 1, 2, 3], name='B')
    right = DataFrame(vals, columns=cols, index=idx)
    assert_frame_equal(left, right)

    df = pd.DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
                       'C': range(8)})
    df.iloc[3, 1] = nan
    left = df.set_index(['A', 'B']).unstack(0)

    vals = [[3, nan], [0, 4], [1, 5], [2, 6], [nan, 7]]
    cols = MultiIndex([['C'], ['a', 'b']],
                      [[0, 0], [0, 1]],
                      names=[None, 'A'])
    idx = Index([nan, 0, 1, 2, 3], name='B')
    right = DataFrame(vals, columns=cols, index=idx)
    assert_frame_equal(left, right)

    # GH7401
    df = pd.DataFrame({'A': list('aaaaabbbbb'),
                       'B': (date_range('2012-01-01', periods=5)
                             .tolist() * 2),
                       'C': np.arange(10)})

    df.iloc[3, 1] = nan
    left = df.set_index(['A', 'B']).unstack()

    vals = np.array([[3, 0, 1, 2, nan, 4], [nan, 5, 6, 7, 8, 9]])
    idx = Index(['a', 'b'], name='A')
    # Code -1 marks the missing (NaN) date in the expected columns.
    cols = MultiIndex([['C'], date_range('2012-01-01', periods=5)],
                      [[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
                      names=[None, 'B'])

    right = DataFrame(vals, columns=cols, index=idx)
    assert_frame_equal(left, right)

    # GH4862
    vals = [['Hg', nan, nan, 680585148],
            ['U', 0.0, nan, 680585148],
            ['Pb', 7.07e-06, nan, 680585148],
            ['Sn', 2.3614e-05, 0.0133, 680607017],
            ['Ag', 0.0, 0.0133, 680607017],
            ['Hg', -0.00015, 0.0133, 680607017]]
    df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'],
                   index=[17263, 17264, 17265, 17266, 17267, 17268])

    left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack()

    vals = [[nan, nan, 7.07e-06, nan, 0.0],
            [0.0, -0.00015, nan, 2.3614e-05, nan]]

    idx = MultiIndex([[680585148, 680607017], [0.0133]],
                     [[0, 1], [-1, 0]],
                     names=['s_id', 'dosage'])

    cols = MultiIndex([['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']],
                      [[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
                      names=[None, 'agent'])

    right = DataFrame(vals, columns=cols, index=idx)
    assert_frame_equal(left, right)

    left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent'])
    assert_frame_equal(left.unstack(), right)

    # GH9497 - multiple unstack with nulls
    df = DataFrame({'1st': [1, 2, 1, 2, 1, 2],
                    '2nd': pd.date_range('2014-02-01', periods=6,
                                         freq='D'),
                    'jim': 100 + np.arange(6),
                    'joe': (np.random.randn(6) * 10).round(2)})

    df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
    df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan
    df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan

    left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
    assert left.notna().values.sum() == 2 * len(df)

    # Every original value must be reachable under its own labels.
    for col in ['jim', 'joe']:
        for _, r in df.iterrows():
            key = r['1st'], (col, r['2nd'], r['3rd'])
            assert r[col] == left.loc[key]
def test_stack_datetime_column_multiIndex(self):
# GH 8039
t = datetime(2014, 1, 1)
df = DataFrame(
[1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, 'A', 'B')]))
result = df.stack()
eidx = MultiIndex.from_product([(0, 1, 2, 3), ('B',)])
ecols = MultiIndex.from_tuples([(t, 'A')])
expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
assert_frame_equal(result, expected)
def test_stack_partial_multiIndex(self):
    """Stack frames whose columns are subsets of a larger MultiIndex."""
    # GH 8844
    def _assert_same(result, expected):
        # The stacked objects compared here are a Series or a DataFrame.
        if isinstance(expected, Series):
            assert_series_equal(result, expected)
        else:
            assert_frame_equal(result, expected)

    def _test_stack_with_multiindex(multiindex):
        width = len(multiindex)
        df = DataFrame(np.arange(3 * width).reshape(3, width),
                       columns=multiindex)
        for level in (-1, 0, 1, [0, 1], [1, 0]):
            result = df.stack(level=level, dropna=False)

            if isinstance(level, int):
                # Stacking a single level should not make any all-NaN
                # rows, so dropna=False and dropna=True must agree.
                _assert_same(result, df.stack(level=level, dropna=True))

            # Rebuild the columns from their tuples; stacking the
            # rebuilt frame must give the same result.
            df.columns = MultiIndex.from_tuples(df.columns.get_values(),
                                                names=df.columns.names)
            _assert_same(result, df.stack(level=level, dropna=False))

    full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'),
                                              ('A', 'y'),
                                              ('C', 'x'), ('C', 'u')],
                                             names=['Upper', 'Lower'])
    position_sets = ([0, 1, 2, 3, 4],
                     [0, 1, 2, 3], [0, 1, 2, 4],
                     [0, 1, 2], [1, 2, 3], [2, 3, 4],
                     [0, 1], [0, 2], [0, 3],
                     [0], [2], [4])
    for positions in position_sets:
        _test_stack_with_multiindex(full_multiindex[positions])
        if len(positions) > 1:
            # Same subset, taken in reverse order.
            _test_stack_with_multiindex(full_multiindex[positions[::-1]])

    frame = DataFrame(np.arange(6).reshape(2, 3),
                      columns=full_multiindex[[0, 1, 3]])
    result = frame.stack(dropna=False)

    # Levels and integer codes are passed positionally.
    expected_index = MultiIndex([[0, 1], ['u', 'x', 'y', 'z']],
                                [[0, 0, 1, 1], [1, 3, 1, 3]],
                                names=[None, 'Lower'])
    expected = DataFrame([[0, 2], [1, nan], [3, 5], [4, nan]],
                         index=expected_index,
                         columns=Index(['B', 'C'], name='Upper'),
                         dtype=frame.dtypes[0])
    assert_frame_equal(result, expected)
def test_stack_preserve_categorical_dtype(self):
# GH13854
for ordered in [False, True]:
for labels in [list("yxz"), list("yxy")]:
cidx = pd.CategoricalIndex(labels, categories=list("xyz"),
ordered=ordered)
df = DataFrame([[10, 11, 12]], columns=cidx)
result = df.stack()
# `MutliIndex.from_product` preserves categorical dtype -
# it's tested elsewhere.
midx = pd.MultiIndex.from_product([df.index, cidx])
expected = Series([10, 11, 12], index=midx)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("level", [0, 'baz'])
def test_unstack_swaplevel_sortlevel(self, level):
    """Unstack, swap the column levels, then sort columns by ``level``."""
    # GH 20994
    row_index = pd.MultiIndex.from_product([[0], ['d', 'c']],
                                           names=['bar', 'baz'])
    frame = pd.DataFrame([[0, 2], [1, 3]], index=row_index,
                         columns=['B', 'A'])
    frame.columns.name = 'foo'

    expected_columns = pd.MultiIndex.from_tuples(
        [('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')],
        names=['baz', 'foo'])
    expected = pd.DataFrame([[3, 1, 2, 0]], columns=expected_columns)
    expected.index.name = 'bar'

    swapped = frame.unstack().swaplevel(axis=1)
    result = swapped.sort_index(axis=1, level=level)
    tm.assert_frame_equal(result, expected)
def test_unstack_fill_frame_object():
    """Unstack object-dtype values with and without ``fill_value``."""
    # GH12815 Test unstacking with object.
    keys = [('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')]
    series = pd.Series(['a', 'b', 'c', 'a'], dtype='object')
    series.index = pd.MultiIndex.from_tuples(keys)
    row_labels = list('xyz')

    # By default missing values will be NaN
    expected = pd.DataFrame(
        {'a': ['a', np.nan, 'a'], 'b': ['b', 'c', np.nan]},
        index=row_labels
    )
    assert_frame_equal(series.unstack(), expected)

    # Fill with any value replaces missing values as expected
    expected = pd.DataFrame(
        {'a': ['a', 'd', 'a'], 'b': ['b', 'c', 'd']},
        index=row_labels
    )
    assert_frame_equal(series.unstack(fill_value='d'), expected)