912 lines
36 KiB
Python
912 lines
36 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
from __future__ import print_function
|
|
|
|
from warnings import catch_warnings
|
|
from datetime import datetime
|
|
|
|
import itertools
|
|
import pytest
|
|
|
|
from numpy.random import randn
|
|
from numpy import nan
|
|
import numpy as np
|
|
|
|
from pandas.compat import u
|
|
from pandas import (DataFrame, Index, Series, MultiIndex, date_range,
|
|
Timedelta, Period)
|
|
import pandas as pd
|
|
|
|
from pandas.util.testing import assert_series_equal, assert_frame_equal
|
|
|
|
import pandas.util.testing as tm
|
|
|
|
from pandas.tests.frame.common import TestData
|
|
|
|
|
|
class TestDataFrameReshape(TestData):
|
|
|
|
def test_pivot(self):
|
|
data = {
|
|
'index': ['A', 'B', 'C', 'C', 'B', 'A'],
|
|
'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
|
|
'values': [1., 2., 3., 3., 2., 1.]
|
|
}
|
|
|
|
frame = DataFrame(data)
|
|
pivoted = frame.pivot(
|
|
index='index', columns='columns', values='values')
|
|
|
|
expected = DataFrame({
|
|
'One': {'A': 1., 'B': 2., 'C': 3.},
|
|
'Two': {'A': 1., 'B': 2., 'C': 3.}
|
|
})
|
|
|
|
expected.index.name, expected.columns.name = 'index', 'columns'
|
|
tm.assert_frame_equal(pivoted, expected)
|
|
|
|
# name tracking
|
|
assert pivoted.index.name == 'index'
|
|
assert pivoted.columns.name == 'columns'
|
|
|
|
# don't specify values
|
|
pivoted = frame.pivot(index='index', columns='columns')
|
|
assert pivoted.index.name == 'index'
|
|
assert pivoted.columns.names == (None, 'columns')
|
|
|
|
with catch_warnings(record=True):
|
|
# pivot multiple columns
|
|
wp = tm.makePanel()
|
|
lp = wp.to_frame()
|
|
df = lp.reset_index()
|
|
tm.assert_frame_equal(df.pivot('major', 'minor'), lp.unstack())
|
|
|
|
def test_pivot_duplicates(self):
|
|
data = DataFrame({'a': ['bar', 'bar', 'foo', 'foo', 'foo'],
|
|
'b': ['one', 'two', 'one', 'one', 'two'],
|
|
'c': [1., 2., 3., 3., 4.]})
|
|
with tm.assert_raises_regex(ValueError, 'duplicate entries'):
|
|
data.pivot('a', 'b', 'c')
|
|
|
|
def test_pivot_empty(self):
|
|
df = DataFrame({}, columns=['a', 'b', 'c'])
|
|
result = df.pivot('a', 'b', 'c')
|
|
expected = DataFrame({})
|
|
tm.assert_frame_equal(result, expected, check_names=False)
|
|
|
|
def test_pivot_integer_bug(self):
|
|
df = DataFrame(data=[("A", "1", "A1"), ("B", "2", "B2")])
|
|
|
|
result = df.pivot(index=1, columns=0, values=2)
|
|
repr(result)
|
|
tm.assert_index_equal(result.columns, Index(['A', 'B'], name=0))
|
|
|
|
def test_pivot_index_none(self):
|
|
# gh-3962
|
|
data = {
|
|
'index': ['A', 'B', 'C', 'C', 'B', 'A'],
|
|
'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
|
|
'values': [1., 2., 3., 3., 2., 1.]
|
|
}
|
|
|
|
frame = DataFrame(data).set_index('index')
|
|
result = frame.pivot(columns='columns', values='values')
|
|
expected = DataFrame({
|
|
'One': {'A': 1., 'B': 2., 'C': 3.},
|
|
'Two': {'A': 1., 'B': 2., 'C': 3.}
|
|
})
|
|
|
|
expected.index.name, expected.columns.name = 'index', 'columns'
|
|
assert_frame_equal(result, expected)
|
|
|
|
# omit values
|
|
result = frame.pivot(columns='columns')
|
|
|
|
expected.columns = pd.MultiIndex.from_tuples([('values', 'One'),
|
|
('values', 'Two')],
|
|
names=[None, 'columns'])
|
|
expected.index.name = 'index'
|
|
tm.assert_frame_equal(result, expected, check_names=False)
|
|
assert result.index.name == 'index'
|
|
assert result.columns.names == (None, 'columns')
|
|
expected.columns = expected.columns.droplevel(0)
|
|
result = frame.pivot(columns='columns', values='values')
|
|
|
|
expected.columns.name = 'columns'
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_stack_unstack(self):
|
|
df = self.frame.copy()
|
|
df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)
|
|
|
|
stacked = df.stack()
|
|
stacked_df = DataFrame({'foo': stacked, 'bar': stacked})
|
|
|
|
unstacked = stacked.unstack()
|
|
unstacked_df = stacked_df.unstack()
|
|
|
|
assert_frame_equal(unstacked, df)
|
|
assert_frame_equal(unstacked_df['bar'], df)
|
|
|
|
unstacked_cols = stacked.unstack(0)
|
|
unstacked_cols_df = stacked_df.unstack(0)
|
|
assert_frame_equal(unstacked_cols.T, df)
|
|
assert_frame_equal(unstacked_cols_df['bar'].T, df)
|
|
|
|
def test_stack_mixed_level(self):
|
|
# GH 18310
|
|
levels = [range(3), [3, 'a', 'b'], [1, 2]]
|
|
|
|
# flat columns:
|
|
df = DataFrame(1, index=levels[0], columns=levels[1])
|
|
result = df.stack()
|
|
expected = Series(1, index=MultiIndex.from_product(levels[:2]))
|
|
assert_series_equal(result, expected)
|
|
|
|
# MultiIndex columns:
|
|
df = DataFrame(1, index=levels[0],
|
|
columns=MultiIndex.from_product(levels[1:]))
|
|
result = df.stack(1)
|
|
expected = DataFrame(1, index=MultiIndex.from_product([levels[0],
|
|
levels[2]]),
|
|
columns=levels[1])
|
|
assert_frame_equal(result, expected)
|
|
|
|
# as above, but used labels in level are actually of homogeneous type
|
|
result = df[['a', 'b']].stack(1)
|
|
expected = expected[['a', 'b']]
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_unstack_fill(self):
|
|
|
|
# GH #9746: fill_value keyword argument for Series
|
|
# and DataFrame unstack
|
|
|
|
# From a series
|
|
data = Series([1, 2, 4, 5], dtype=np.int16)
|
|
data.index = MultiIndex.from_tuples(
|
|
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
|
|
|
result = data.unstack(fill_value=-1)
|
|
expected = DataFrame({'a': [1, -1, 5], 'b': [2, 4, -1]},
|
|
index=['x', 'y', 'z'], dtype=np.int16)
|
|
assert_frame_equal(result, expected)
|
|
|
|
# From a series with incorrect data type for fill_value
|
|
result = data.unstack(fill_value=0.5)
|
|
expected = DataFrame({'a': [1, 0.5, 5], 'b': [2, 4, 0.5]},
|
|
index=['x', 'y', 'z'], dtype=np.float)
|
|
assert_frame_equal(result, expected)
|
|
|
|
# GH #13971: fill_value when unstacking multiple levels:
|
|
df = DataFrame({'x': ['a', 'a', 'b'],
|
|
'y': ['j', 'k', 'j'],
|
|
'z': [0, 1, 2],
|
|
'w': [0, 1, 2]}).set_index(['x', 'y', 'z'])
|
|
unstacked = df.unstack(['x', 'y'], fill_value=0)
|
|
key = ('w', 'b', 'j')
|
|
expected = unstacked[key]
|
|
result = pd.Series([0, 0, 2], index=unstacked.index, name=key)
|
|
assert_series_equal(result, expected)
|
|
|
|
stacked = unstacked.stack(['x', 'y'])
|
|
stacked.index = stacked.index.reorder_levels(df.index.names)
|
|
# Workaround for GH #17886 (unnecessarily casts to float):
|
|
stacked = stacked.astype(np.int64)
|
|
result = stacked.loc[df.index]
|
|
assert_frame_equal(result, df)
|
|
|
|
# From a series
|
|
s = df['w']
|
|
result = s.unstack(['x', 'y'], fill_value=0)
|
|
expected = unstacked['w']
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_unstack_fill_frame(self):
|
|
|
|
# From a dataframe
|
|
rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
|
|
df = DataFrame(rows, columns=list('AB'), dtype=np.int32)
|
|
df.index = MultiIndex.from_tuples(
|
|
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
|
|
|
result = df.unstack(fill_value=-1)
|
|
|
|
rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
|
|
expected = DataFrame(rows, index=list('xyz'), dtype=np.int32)
|
|
expected.columns = MultiIndex.from_tuples(
|
|
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
|
|
assert_frame_equal(result, expected)
|
|
|
|
# From a mixed type dataframe
|
|
df['A'] = df['A'].astype(np.int16)
|
|
df['B'] = df['B'].astype(np.float64)
|
|
|
|
result = df.unstack(fill_value=-1)
|
|
expected['A'] = expected['A'].astype(np.int16)
|
|
expected['B'] = expected['B'].astype(np.float64)
|
|
assert_frame_equal(result, expected)
|
|
|
|
# From a dataframe with incorrect data type for fill_value
|
|
result = df.unstack(fill_value=0.5)
|
|
|
|
rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
|
|
expected = DataFrame(rows, index=list('xyz'), dtype=np.float)
|
|
expected.columns = MultiIndex.from_tuples(
|
|
[('A', 'a'), ('A', 'b'), ('B', 'a'), ('B', 'b')])
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_unstack_fill_frame_datetime(self):
|
|
|
|
# Test unstacking with date times
|
|
dv = pd.date_range('2012-01-01', periods=4).values
|
|
data = Series(dv)
|
|
data.index = MultiIndex.from_tuples(
|
|
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
|
|
|
result = data.unstack()
|
|
expected = DataFrame({'a': [dv[0], pd.NaT, dv[3]],
|
|
'b': [dv[1], dv[2], pd.NaT]},
|
|
index=['x', 'y', 'z'])
|
|
assert_frame_equal(result, expected)
|
|
|
|
result = data.unstack(fill_value=dv[0])
|
|
expected = DataFrame({'a': [dv[0], dv[0], dv[3]],
|
|
'b': [dv[1], dv[2], dv[0]]},
|
|
index=['x', 'y', 'z'])
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_unstack_fill_frame_timedelta(self):
|
|
|
|
# Test unstacking with time deltas
|
|
td = [Timedelta(days=i) for i in range(4)]
|
|
data = Series(td)
|
|
data.index = MultiIndex.from_tuples(
|
|
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
|
|
|
result = data.unstack()
|
|
expected = DataFrame({'a': [td[0], pd.NaT, td[3]],
|
|
'b': [td[1], td[2], pd.NaT]},
|
|
index=['x', 'y', 'z'])
|
|
assert_frame_equal(result, expected)
|
|
|
|
result = data.unstack(fill_value=td[1])
|
|
expected = DataFrame({'a': [td[0], td[1], td[3]],
|
|
'b': [td[1], td[2], td[1]]},
|
|
index=['x', 'y', 'z'])
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_unstack_fill_frame_period(self):
|
|
|
|
# Test unstacking with period
|
|
periods = [Period('2012-01'), Period('2012-02'), Period('2012-03'),
|
|
Period('2012-04')]
|
|
data = Series(periods)
|
|
data.index = MultiIndex.from_tuples(
|
|
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
|
|
|
result = data.unstack()
|
|
expected = DataFrame({'a': [periods[0], None, periods[3]],
|
|
'b': [periods[1], periods[2], None]},
|
|
index=['x', 'y', 'z'])
|
|
assert_frame_equal(result, expected)
|
|
|
|
result = data.unstack(fill_value=periods[1])
|
|
expected = DataFrame({'a': [periods[0], periods[1], periods[3]],
|
|
'b': [periods[1], periods[2], periods[1]]},
|
|
index=['x', 'y', 'z'])
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_unstack_fill_frame_categorical(self):
|
|
|
|
# Test unstacking with categorical
|
|
data = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
|
|
data.index = pd.MultiIndex.from_tuples(
|
|
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
|
|
|
# By default missing values will be NaN
|
|
result = data.unstack()
|
|
expected = DataFrame({'a': pd.Categorical(list('axa'),
|
|
categories=list('abc')),
|
|
'b': pd.Categorical(list('bcx'),
|
|
categories=list('abc'))},
|
|
index=list('xyz'))
|
|
assert_frame_equal(result, expected)
|
|
|
|
# Fill with non-category results in NaN entries similar to above
|
|
result = data.unstack(fill_value='d')
|
|
assert_frame_equal(result, expected)
|
|
|
|
# Fill with category value replaces missing values as expected
|
|
result = data.unstack(fill_value='c')
|
|
expected = DataFrame({'a': pd.Categorical(list('aca'),
|
|
categories=list('abc')),
|
|
'b': pd.Categorical(list('bcc'),
|
|
categories=list('abc'))},
|
|
index=list('xyz'))
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_unstack_preserve_dtypes(self):
|
|
# Checks fix for #11847
|
|
df = pd.DataFrame(dict(state=['IL', 'MI', 'NC'],
|
|
index=['a', 'b', 'c'],
|
|
some_categories=pd.Series(['a', 'b', 'c']
|
|
).astype('category'),
|
|
A=np.random.rand(3),
|
|
B=1,
|
|
C='foo',
|
|
D=pd.Timestamp('20010102'),
|
|
E=pd.Series([1.0, 50.0, 100.0]
|
|
).astype('float32'),
|
|
F=pd.Series([3.0, 4.0, 5.0]).astype('float64'),
|
|
G=False,
|
|
H=pd.Series([1, 200, 923442], dtype='int8')))
|
|
|
|
def unstack_and_compare(df, column_name):
|
|
unstacked1 = df.unstack([column_name])
|
|
unstacked2 = df.unstack(column_name)
|
|
assert_frame_equal(unstacked1, unstacked2)
|
|
|
|
df1 = df.set_index(['state', 'index'])
|
|
unstack_and_compare(df1, 'index')
|
|
|
|
df1 = df.set_index(['state', 'some_categories'])
|
|
unstack_and_compare(df1, 'some_categories')
|
|
|
|
df1 = df.set_index(['F', 'C'])
|
|
unstack_and_compare(df1, 'F')
|
|
|
|
df1 = df.set_index(['G', 'B', 'state'])
|
|
unstack_and_compare(df1, 'B')
|
|
|
|
df1 = df.set_index(['E', 'A'])
|
|
unstack_and_compare(df1, 'E')
|
|
|
|
df1 = df.set_index(['state', 'index'])
|
|
s = df1['A']
|
|
unstack_and_compare(s, 'index')
|
|
|
|
def test_stack_ints(self):
|
|
columns = MultiIndex.from_tuples(list(itertools.product(range(3),
|
|
repeat=3)))
|
|
df = DataFrame(np.random.randn(30, 27), columns=columns)
|
|
|
|
assert_frame_equal(df.stack(level=[1, 2]),
|
|
df.stack(level=1).stack(level=1))
|
|
assert_frame_equal(df.stack(level=[-2, -1]),
|
|
df.stack(level=1).stack(level=1))
|
|
|
|
df_named = df.copy()
|
|
df_named.columns.set_names(range(3), inplace=True)
|
|
|
|
assert_frame_equal(df_named.stack(level=[1, 2]),
|
|
df_named.stack(level=1).stack(level=1))
|
|
|
|
def test_stack_mixed_levels(self):
|
|
columns = MultiIndex.from_tuples(
|
|
[('A', 'cat', 'long'), ('B', 'cat', 'long'),
|
|
('A', 'dog', 'short'), ('B', 'dog', 'short')],
|
|
names=['exp', 'animal', 'hair_length']
|
|
)
|
|
df = DataFrame(randn(4, 4), columns=columns)
|
|
|
|
animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
|
|
exp_hair_stacked = df.stack(level=['exp', 'hair_length'])
|
|
|
|
# GH #8584: Need to check that stacking works when a number
|
|
# is passed that is both a level name and in the range of
|
|
# the level numbers
|
|
df2 = df.copy()
|
|
df2.columns.names = ['exp', 'animal', 1]
|
|
assert_frame_equal(df2.stack(level=['animal', 1]),
|
|
animal_hair_stacked, check_names=False)
|
|
assert_frame_equal(df2.stack(level=['exp', 1]),
|
|
exp_hair_stacked, check_names=False)
|
|
|
|
# When mixed types are passed and the ints are not level
|
|
# names, raise
|
|
pytest.raises(ValueError, df2.stack, level=['animal', 0])
|
|
|
|
# GH #8584: Having 0 in the level names could raise a
|
|
# strange error about lexsort depth
|
|
df3 = df.copy()
|
|
df3.columns.names = ['exp', 'animal', 0]
|
|
assert_frame_equal(df3.stack(level=['animal', 0]),
|
|
animal_hair_stacked, check_names=False)
|
|
|
|
def test_stack_int_level_names(self):
|
|
columns = MultiIndex.from_tuples(
|
|
[('A', 'cat', 'long'), ('B', 'cat', 'long'),
|
|
('A', 'dog', 'short'), ('B', 'dog', 'short')],
|
|
names=['exp', 'animal', 'hair_length']
|
|
)
|
|
df = DataFrame(randn(4, 4), columns=columns)
|
|
|
|
exp_animal_stacked = df.stack(level=['exp', 'animal'])
|
|
animal_hair_stacked = df.stack(level=['animal', 'hair_length'])
|
|
exp_hair_stacked = df.stack(level=['exp', 'hair_length'])
|
|
|
|
df2 = df.copy()
|
|
df2.columns.names = [0, 1, 2]
|
|
assert_frame_equal(df2.stack(level=[1, 2]), animal_hair_stacked,
|
|
check_names=False)
|
|
assert_frame_equal(df2.stack(level=[0, 1]), exp_animal_stacked,
|
|
check_names=False)
|
|
assert_frame_equal(df2.stack(level=[0, 2]), exp_hair_stacked,
|
|
check_names=False)
|
|
|
|
# Out-of-order int column names
|
|
df3 = df.copy()
|
|
df3.columns.names = [2, 0, 1]
|
|
assert_frame_equal(df3.stack(level=[0, 1]), animal_hair_stacked,
|
|
check_names=False)
|
|
assert_frame_equal(df3.stack(level=[2, 0]), exp_animal_stacked,
|
|
check_names=False)
|
|
assert_frame_equal(df3.stack(level=[2, 1]), exp_hair_stacked,
|
|
check_names=False)
|
|
|
|
def test_unstack_bool(self):
|
|
df = DataFrame([False, False],
|
|
index=MultiIndex.from_arrays([['a', 'b'], ['c', 'l']]),
|
|
columns=['col'])
|
|
rs = df.unstack()
|
|
xp = DataFrame(np.array([[False, np.nan], [np.nan, False]],
|
|
dtype=object),
|
|
index=['a', 'b'],
|
|
columns=MultiIndex.from_arrays([['col', 'col'],
|
|
['c', 'l']]))
|
|
assert_frame_equal(rs, xp)
|
|
|
|
def test_unstack_level_binding(self):
|
|
# GH9856
|
|
mi = pd.MultiIndex(
|
|
levels=[[u('foo'), u('bar')], [u('one'), u('two')],
|
|
[u('a'), u('b')]],
|
|
labels=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
|
|
names=[u('first'), u('second'), u('third')])
|
|
s = pd.Series(0, index=mi)
|
|
result = s.unstack([1, 2]).stack(0)
|
|
|
|
expected_mi = pd.MultiIndex(
|
|
levels=[['foo', 'bar'], ['one', 'two']],
|
|
labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
|
|
names=['first', 'second'])
|
|
|
|
expected = pd.DataFrame(np.array([[np.nan, 0],
|
|
[0, np.nan],
|
|
[np.nan, 0],
|
|
[0, np.nan]],
|
|
dtype=np.float64),
|
|
index=expected_mi,
|
|
columns=pd.Index(['a', 'b'], name='third'))
|
|
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_unstack_to_series(self):
|
|
# check reversibility
|
|
data = self.frame.unstack()
|
|
|
|
assert isinstance(data, Series)
|
|
undo = data.unstack().T
|
|
assert_frame_equal(undo, self.frame)
|
|
|
|
# check NA handling
|
|
data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]})
|
|
data.index = Index(['a', 'b', 'c'])
|
|
result = data.unstack()
|
|
|
|
midx = MultiIndex(levels=[['x', 'y'], ['a', 'b', 'c']],
|
|
labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])
|
|
expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx)
|
|
|
|
assert_series_equal(result, expected)
|
|
|
|
# check composability of unstack
|
|
old_data = data.copy()
|
|
for _ in range(4):
|
|
data = data.unstack()
|
|
assert_frame_equal(old_data, data)
|
|
|
|
def test_unstack_dtypes(self):
|
|
|
|
# GH 2929
|
|
rows = [[1, 1, 3, 4],
|
|
[1, 2, 3, 4],
|
|
[2, 1, 3, 4],
|
|
[2, 2, 3, 4]]
|
|
|
|
df = DataFrame(rows, columns=list('ABCD'))
|
|
result = df.get_dtype_counts()
|
|
expected = Series({'int64': 4})
|
|
assert_series_equal(result, expected)
|
|
|
|
# single dtype
|
|
df2 = df.set_index(['A', 'B'])
|
|
df3 = df2.unstack('B')
|
|
result = df3.get_dtype_counts()
|
|
expected = Series({'int64': 4})
|
|
assert_series_equal(result, expected)
|
|
|
|
# mixed
|
|
df2 = df.set_index(['A', 'B'])
|
|
df2['C'] = 3.
|
|
df3 = df2.unstack('B')
|
|
result = df3.get_dtype_counts()
|
|
expected = Series({'int64': 2, 'float64': 2})
|
|
assert_series_equal(result, expected)
|
|
|
|
df2['D'] = 'foo'
|
|
df3 = df2.unstack('B')
|
|
result = df3.get_dtype_counts()
|
|
expected = Series({'float64': 2, 'object': 2})
|
|
assert_series_equal(result, expected)
|
|
|
|
# GH7405
|
|
for c, d in (np.zeros(5), np.zeros(5)), \
|
|
(np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')):
|
|
|
|
df = DataFrame({'A': ['a'] * 5, 'C': c, 'D': d,
|
|
'B': pd.date_range('2012-01-01', periods=5)})
|
|
|
|
right = df.iloc[:3].copy(deep=True)
|
|
|
|
df = df.set_index(['A', 'B'])
|
|
df['D'] = df['D'].astype('int64')
|
|
|
|
left = df.iloc[:3].unstack(0)
|
|
right = right.set_index(['A', 'B']).unstack(0)
|
|
right[('D', 'a')] = right[('D', 'a')].astype('int64')
|
|
|
|
assert left.shape == (3, 2)
|
|
tm.assert_frame_equal(left, right)
|
|
|
|
def test_unstack_non_unique_index_names(self):
|
|
idx = MultiIndex.from_tuples([('a', 'b'), ('c', 'd')],
|
|
names=['c1', 'c1'])
|
|
df = DataFrame([1, 2], index=idx)
|
|
with pytest.raises(ValueError):
|
|
df.unstack('c1')
|
|
|
|
with pytest.raises(ValueError):
|
|
df.T.stack('c1')
|
|
|
|
def test_unstack_unused_levels(self):
|
|
# GH 17845: unused labels in index make unstack() cast int to float
|
|
idx = pd.MultiIndex.from_product([['a'], ['A', 'B', 'C', 'D']])[:-1]
|
|
df = pd.DataFrame([[1, 0]] * 3, index=idx)
|
|
|
|
result = df.unstack()
|
|
exp_col = pd.MultiIndex.from_product([[0, 1], ['A', 'B', 'C']])
|
|
expected = pd.DataFrame([[1, 1, 1, 0, 0, 0]], index=['a'],
|
|
columns=exp_col)
|
|
tm.assert_frame_equal(result, expected)
|
|
assert((result.columns.levels[1] == idx.levels[1]).all())
|
|
|
|
# Unused items on both levels
|
|
levels = [[0, 1, 7], [0, 1, 2, 3]]
|
|
labels = [[0, 0, 1, 1], [0, 2, 0, 2]]
|
|
idx = pd.MultiIndex(levels, labels)
|
|
block = np.arange(4).reshape(2, 2)
|
|
df = pd.DataFrame(np.concatenate([block, block + 4]), index=idx)
|
|
result = df.unstack()
|
|
expected = pd.DataFrame(np.concatenate([block * 2, block * 2 + 1],
|
|
axis=1),
|
|
columns=idx)
|
|
tm.assert_frame_equal(result, expected)
|
|
assert((result.columns.levels[1] == idx.levels[1]).all())
|
|
|
|
# With mixed dtype and NaN
|
|
levels = [['a', 2, 'c'], [1, 3, 5, 7]]
|
|
labels = [[0, -1, 1, 1], [0, 2, -1, 2]]
|
|
idx = pd.MultiIndex(levels, labels)
|
|
data = np.arange(8)
|
|
df = pd.DataFrame(data.reshape(4, 2), index=idx)
|
|
|
|
cases = ((0, [13, 16, 6, 9, 2, 5, 8, 11],
|
|
[np.nan, 'a', 2], [np.nan, 5, 1]),
|
|
(1, [8, 11, 1, 4, 12, 15, 13, 16],
|
|
[np.nan, 5, 1], [np.nan, 'a', 2]))
|
|
for level, idces, col_level, idx_level in cases:
|
|
result = df.unstack(level=level)
|
|
exp_data = np.zeros(18) * np.nan
|
|
exp_data[idces] = data
|
|
cols = pd.MultiIndex.from_product([[0, 1], col_level])
|
|
expected = pd.DataFrame(exp_data.reshape(3, 6),
|
|
index=idx_level, columns=cols)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize("cols", [['A', 'C'], slice(None)])
|
|
def test_unstack_unused_level(self, cols):
|
|
# GH 18562 : unused labels on the unstacked level
|
|
df = pd.DataFrame([[2010, 'a', 'I'],
|
|
[2011, 'b', 'II']],
|
|
columns=['A', 'B', 'C'])
|
|
|
|
ind = df.set_index(['A', 'B', 'C'], drop=False)
|
|
selection = ind.loc[(slice(None), slice(None), 'I'), cols]
|
|
result = selection.unstack()
|
|
|
|
expected = ind.iloc[[0]][cols]
|
|
expected.columns = MultiIndex.from_product([expected.columns, ['I']],
|
|
names=[None, 'C'])
|
|
expected.index = expected.index.droplevel('C')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_unstack_nan_index(self): # GH7466
|
|
cast = lambda val: '{0:1}'.format('' if val != val else val)
|
|
nan = np.nan
|
|
|
|
def verify(df):
|
|
mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
|
|
rows, cols = df.notna().values.nonzero()
|
|
for i, j in zip(rows, cols):
|
|
left = sorted(df.iloc[i, j].split('.'))
|
|
right = mk_list(df.index[i]) + mk_list(df.columns[j])
|
|
right = sorted(list(map(cast, right)))
|
|
assert left == right
|
|
|
|
df = DataFrame({'jim': ['a', 'b', nan, 'd'],
|
|
'joe': ['w', 'x', 'y', 'z'],
|
|
'jolie': ['a.w', 'b.x', ' .y', 'd.z']})
|
|
|
|
left = df.set_index(['jim', 'joe']).unstack()['jolie']
|
|
right = df.set_index(['joe', 'jim']).unstack()['jolie'].T
|
|
assert_frame_equal(left, right)
|
|
|
|
for idx in itertools.permutations(df.columns[:2]):
|
|
mi = df.set_index(list(idx))
|
|
for lev in range(2):
|
|
udf = mi.unstack(level=lev)
|
|
assert udf.notna().values.sum() == len(df)
|
|
verify(udf['jolie'])
|
|
|
|
df = DataFrame({'1st': ['d'] * 3 + [nan] * 5 + ['a'] * 2 +
|
|
['c'] * 3 + ['e'] * 2 + ['b'] * 5,
|
|
'2nd': ['y'] * 2 + ['w'] * 3 + [nan] * 3 +
|
|
['z'] * 4 + [nan] * 3 + ['x'] * 3 + [nan] * 2,
|
|
'3rd': [67, 39, 53, 72, 57, 80, 31, 18, 11, 30, 59,
|
|
50, 62, 59, 76, 52, 14, 53, 60, 51]})
|
|
|
|
df['4th'], df['5th'] = \
|
|
df.apply(lambda r: '.'.join(map(cast, r)), axis=1), \
|
|
df.apply(lambda r: '.'.join(map(cast, r.iloc[::-1])), axis=1)
|
|
|
|
for idx in itertools.permutations(['1st', '2nd', '3rd']):
|
|
mi = df.set_index(list(idx))
|
|
for lev in range(3):
|
|
udf = mi.unstack(level=lev)
|
|
assert udf.notna().values.sum() == 2 * len(df)
|
|
for col in ['4th', '5th']:
|
|
verify(udf[col])
|
|
|
|
# GH7403
|
|
df = pd.DataFrame(
|
|
{'A': list('aaaabbbb'), 'B': range(8), 'C': range(8)})
|
|
df.iloc[3, 1] = np.NaN
|
|
left = df.set_index(['A', 'B']).unstack(0)
|
|
|
|
vals = [[3, 0, 1, 2, nan, nan, nan, nan],
|
|
[nan, nan, nan, nan, 4, 5, 6, 7]]
|
|
vals = list(map(list, zip(*vals)))
|
|
idx = Index([nan, 0, 1, 2, 4, 5, 6, 7], name='B')
|
|
cols = MultiIndex(levels=[['C'], ['a', 'b']],
|
|
labels=[[0, 0], [0, 1]],
|
|
names=[None, 'A'])
|
|
|
|
right = DataFrame(vals, columns=cols, index=idx)
|
|
assert_frame_equal(left, right)
|
|
|
|
df = DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
|
|
'C': range(8)})
|
|
df.iloc[2, 1] = np.NaN
|
|
left = df.set_index(['A', 'B']).unstack(0)
|
|
|
|
vals = [[2, nan], [0, 4], [1, 5], [nan, 6], [3, 7]]
|
|
cols = MultiIndex(levels=[['C'], ['a', 'b']],
|
|
labels=[[0, 0], [0, 1]],
|
|
names=[None, 'A'])
|
|
idx = Index([nan, 0, 1, 2, 3], name='B')
|
|
right = DataFrame(vals, columns=cols, index=idx)
|
|
assert_frame_equal(left, right)
|
|
|
|
df = pd.DataFrame({'A': list('aaaabbbb'), 'B': list(range(4)) * 2,
|
|
'C': range(8)})
|
|
df.iloc[3, 1] = np.NaN
|
|
left = df.set_index(['A', 'B']).unstack(0)
|
|
|
|
vals = [[3, nan], [0, 4], [1, 5], [2, 6], [nan, 7]]
|
|
cols = MultiIndex(levels=[['C'], ['a', 'b']],
|
|
labels=[[0, 0], [0, 1]],
|
|
names=[None, 'A'])
|
|
idx = Index([nan, 0, 1, 2, 3], name='B')
|
|
right = DataFrame(vals, columns=cols, index=idx)
|
|
assert_frame_equal(left, right)
|
|
|
|
# GH7401
|
|
df = pd.DataFrame({'A': list('aaaaabbbbb'),
|
|
'B': (date_range('2012-01-01', periods=5)
|
|
.tolist() * 2),
|
|
'C': np.arange(10)})
|
|
|
|
df.iloc[3, 1] = np.NaN
|
|
left = df.set_index(['A', 'B']).unstack()
|
|
|
|
vals = np.array([[3, 0, 1, 2, nan, 4], [nan, 5, 6, 7, 8, 9]])
|
|
idx = Index(['a', 'b'], name='A')
|
|
cols = MultiIndex(levels=[['C'], date_range('2012-01-01', periods=5)],
|
|
labels=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
|
|
names=[None, 'B'])
|
|
|
|
right = DataFrame(vals, columns=cols, index=idx)
|
|
assert_frame_equal(left, right)
|
|
|
|
# GH4862
|
|
vals = [['Hg', nan, nan, 680585148],
|
|
['U', 0.0, nan, 680585148],
|
|
['Pb', 7.07e-06, nan, 680585148],
|
|
['Sn', 2.3614e-05, 0.0133, 680607017],
|
|
['Ag', 0.0, 0.0133, 680607017],
|
|
['Hg', -0.00015, 0.0133, 680607017]]
|
|
df = DataFrame(vals, columns=['agent', 'change', 'dosage', 's_id'],
|
|
index=[17263, 17264, 17265, 17266, 17267, 17268])
|
|
|
|
left = df.copy().set_index(['s_id', 'dosage', 'agent']).unstack()
|
|
|
|
vals = [[nan, nan, 7.07e-06, nan, 0.0],
|
|
[0.0, -0.00015, nan, 2.3614e-05, nan]]
|
|
|
|
idx = MultiIndex(levels=[[680585148, 680607017], [0.0133]],
|
|
labels=[[0, 1], [-1, 0]],
|
|
names=['s_id', 'dosage'])
|
|
|
|
cols = MultiIndex(levels=[['change'], ['Ag', 'Hg', 'Pb', 'Sn', 'U']],
|
|
labels=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
|
|
names=[None, 'agent'])
|
|
|
|
right = DataFrame(vals, columns=cols, index=idx)
|
|
assert_frame_equal(left, right)
|
|
|
|
left = df.loc[17264:].copy().set_index(['s_id', 'dosage', 'agent'])
|
|
assert_frame_equal(left.unstack(), right)
|
|
|
|
# GH9497 - multiple unstack with nulls
|
|
df = DataFrame({'1st': [1, 2, 1, 2, 1, 2],
|
|
'2nd': pd.date_range('2014-02-01', periods=6,
|
|
freq='D'),
|
|
'jim': 100 + np.arange(6),
|
|
'joe': (np.random.randn(6) * 10).round(2)})
|
|
|
|
df['3rd'] = df['2nd'] - pd.Timestamp('2014-02-02')
|
|
df.loc[1, '2nd'] = df.loc[3, '2nd'] = nan
|
|
df.loc[1, '3rd'] = df.loc[4, '3rd'] = nan
|
|
|
|
left = df.set_index(['1st', '2nd', '3rd']).unstack(['2nd', '3rd'])
|
|
assert left.notna().values.sum() == 2 * len(df)
|
|
|
|
for col in ['jim', 'joe']:
|
|
for _, r in df.iterrows():
|
|
key = r['1st'], (col, r['2nd'], r['3rd'])
|
|
assert r[col] == left.loc[key]
|
|
|
|
def test_stack_datetime_column_multiIndex(self):
|
|
# GH 8039
|
|
t = datetime(2014, 1, 1)
|
|
df = DataFrame(
|
|
[1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, 'A', 'B')]))
|
|
result = df.stack()
|
|
|
|
eidx = MultiIndex.from_product([(0, 1, 2, 3), ('B',)])
|
|
ecols = MultiIndex.from_tuples([(t, 'A')])
|
|
expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_stack_partial_multiIndex(self):
|
|
# GH 8844
|
|
def _test_stack_with_multiindex(multiindex):
|
|
df = DataFrame(np.arange(3 * len(multiindex))
|
|
.reshape(3, len(multiindex)),
|
|
columns=multiindex)
|
|
for level in (-1, 0, 1, [0, 1], [1, 0]):
|
|
result = df.stack(level=level, dropna=False)
|
|
|
|
if isinstance(level, int):
|
|
# Stacking a single level should not make any all-NaN rows,
|
|
# so df.stack(level=level, dropna=False) should be the same
|
|
# as df.stack(level=level, dropna=True).
|
|
expected = df.stack(level=level, dropna=True)
|
|
if isinstance(expected, Series):
|
|
assert_series_equal(result, expected)
|
|
else:
|
|
assert_frame_equal(result, expected)
|
|
|
|
df.columns = MultiIndex.from_tuples(df.columns.get_values(),
|
|
names=df.columns.names)
|
|
expected = df.stack(level=level, dropna=False)
|
|
if isinstance(expected, Series):
|
|
assert_series_equal(result, expected)
|
|
else:
|
|
assert_frame_equal(result, expected)
|
|
|
|
full_multiindex = MultiIndex.from_tuples([('B', 'x'), ('B', 'z'),
|
|
('A', 'y'),
|
|
('C', 'x'), ('C', 'u')],
|
|
names=['Upper', 'Lower'])
|
|
for multiindex_columns in ([0, 1, 2, 3, 4],
|
|
[0, 1, 2, 3], [0, 1, 2, 4],
|
|
[0, 1, 2], [1, 2, 3], [2, 3, 4],
|
|
[0, 1], [0, 2], [0, 3],
|
|
[0], [2], [4]):
|
|
_test_stack_with_multiindex(full_multiindex[multiindex_columns])
|
|
if len(multiindex_columns) > 1:
|
|
multiindex_columns.reverse()
|
|
_test_stack_with_multiindex(
|
|
full_multiindex[multiindex_columns])
|
|
|
|
df = DataFrame(np.arange(6).reshape(2, 3),
|
|
columns=full_multiindex[[0, 1, 3]])
|
|
result = df.stack(dropna=False)
|
|
expected = DataFrame([[0, 2], [1, nan], [3, 5], [4, nan]],
|
|
index=MultiIndex(
|
|
levels=[[0, 1], ['u', 'x', 'y', 'z']],
|
|
labels=[[0, 0, 1, 1],
|
|
[1, 3, 1, 3]],
|
|
names=[None, 'Lower']),
|
|
columns=Index(['B', 'C'], name='Upper'),
|
|
dtype=df.dtypes[0])
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_stack_preserve_categorical_dtype(self):
|
|
# GH13854
|
|
for ordered in [False, True]:
|
|
for labels in [list("yxz"), list("yxy")]:
|
|
cidx = pd.CategoricalIndex(labels, categories=list("xyz"),
|
|
ordered=ordered)
|
|
df = DataFrame([[10, 11, 12]], columns=cidx)
|
|
result = df.stack()
|
|
|
|
# `MutliIndex.from_product` preserves categorical dtype -
|
|
# it's tested elsewhere.
|
|
midx = pd.MultiIndex.from_product([df.index, cidx])
|
|
expected = Series([10, 11, 12], index=midx)
|
|
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize("level", [0, 'baz'])
|
|
def test_unstack_swaplevel_sortlevel(self, level):
|
|
# GH 20994
|
|
mi = pd.MultiIndex.from_product([[0], ['d', 'c']],
|
|
names=['bar', 'baz'])
|
|
df = pd.DataFrame([[0, 2], [1, 3]], index=mi, columns=['B', 'A'])
|
|
df.columns.name = 'foo'
|
|
|
|
expected = pd.DataFrame([
|
|
[3, 1, 2, 0]], columns=pd.MultiIndex.from_tuples([
|
|
('c', 'A'), ('c', 'B'), ('d', 'A'), ('d', 'B')], names=[
|
|
'baz', 'foo']))
|
|
expected.index.name = 'bar'
|
|
|
|
result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
|
|
def test_unstack_fill_frame_object():
|
|
# GH12815 Test unstacking with object.
|
|
data = pd.Series(['a', 'b', 'c', 'a'], dtype='object')
|
|
data.index = pd.MultiIndex.from_tuples(
|
|
[('x', 'a'), ('x', 'b'), ('y', 'b'), ('z', 'a')])
|
|
|
|
# By default missing values will be NaN
|
|
result = data.unstack()
|
|
expected = pd.DataFrame(
|
|
{'a': ['a', np.nan, 'a'], 'b': ['b', 'c', np.nan]},
|
|
index=list('xyz')
|
|
)
|
|
assert_frame_equal(result, expected)
|
|
|
|
# Fill with any value replaces missing values as expected
|
|
result = data.unstack(fill_value='d')
|
|
expected = pd.DataFrame(
|
|
{'a': ['a', 'd', 'a'], 'b': ['b', 'c', 'd']},
|
|
index=list('xyz')
|
|
)
|
|
assert_frame_equal(result, expected)
|