643 lines
28 KiB
Python
643 lines
28 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
# pylint: disable-msg=W0612,E1101
|
||
|
|
||
|
import pytest
|
||
|
|
||
|
from pandas import DataFrame
|
||
|
import pandas as pd
|
||
|
|
||
|
from numpy import nan
|
||
|
import numpy as np
|
||
|
|
||
|
from pandas import melt, lreshape, wide_to_long
|
||
|
import pandas.util.testing as tm
|
||
|
from pandas.compat import range
|
||
|
|
||
|
|
||
|
class TestMelt(object):
    """Tests for ``melt`` as a top-level function and as a DataFrame method."""

    def setup_method(self, method):
        """Build the frames and custom column names shared by the tests."""
        # A seeded generator keeps the fixture reproducible and avoids
        # depending on pandas' internal random-frame helpers.
        values = np.random.RandomState(0).randn(10, 4)
        self.df = DataFrame(values, columns=list('ABCD'),
                            index=pd.date_range('2000-01-03', periods=10,
                                                freq='B'))
        self.df['id1'] = (self.df['A'] > 0).astype(np.int64)
        self.df['id2'] = (self.df['B'] > 0).astype(np.int64)

        self.var_name = 'var'
        self.value_name = 'val'

        # Small frame with two-level columns named ('CAP', 'low').
        self.df1 = pd.DataFrame([[1.067683, -1.110463, 0.20867],
                                 [-1.321405, 0.368915, -1.055342],
                                 [-0.807333, 0.08298, -0.873361]])
        self.df1.columns = [list('ABC'), list('abc')]
        self.df1.columns.names = ['CAP', 'low']

    def test_top_level_method(self):
        """The module-level ``melt`` yields the default column names."""
        result = melt(self.df)
        assert result.columns.tolist() == ['variable', 'value']

    def test_method_signatures(self):
        """``DataFrame.melt`` and ``melt`` agree for every keyword tested."""
        tm.assert_frame_equal(self.df.melt(),
                              melt(self.df))

        tm.assert_frame_equal(self.df.melt(id_vars=['id1', 'id2'],
                                           value_vars=['A', 'B']),
                              melt(self.df,
                                   id_vars=['id1', 'id2'],
                                   value_vars=['A', 'B']))

        tm.assert_frame_equal(self.df.melt(var_name=self.var_name,
                                           value_name=self.value_name),
                              melt(self.df,
                                   var_name=self.var_name,
                                   value_name=self.value_name))

        tm.assert_frame_equal(self.df1.melt(col_level=0),
                              melt(self.df1, col_level=0))

    def test_default_col_names(self):
        """Default output columns are the id columns, 'variable', 'value'."""
        result = self.df.melt()
        assert result.columns.tolist() == ['variable', 'value']

        result1 = self.df.melt(id_vars=['id1'])
        assert result1.columns.tolist() == ['id1', 'variable', 'value']

        result2 = self.df.melt(id_vars=['id1', 'id2'])
        assert result2.columns.tolist() == ['id1', 'id2', 'variable', 'value']

    def test_value_vars(self):
        """``value_vars`` accepts a single label or a list of labels."""
        result3 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A')
        assert len(result3) == 10

        result4 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'])
        expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                               'id2': self.df['id2'].tolist() * 2,
                               'variable': ['A'] * 10 + ['B'] * 10,
                               'value': (self.df['A'].tolist() +
                                         self.df['B'].tolist())},
                              columns=['id1', 'id2', 'variable', 'value'])
        tm.assert_frame_equal(result4, expected4)

    def test_value_vars_types(self):
        """``value_vars`` may be a tuple, a list or an ndarray."""
        # GH 15348
        expected = DataFrame({'id1': self.df['id1'].tolist() * 2,
                              'id2': self.df['id2'].tolist() * 2,
                              'variable': ['A'] * 10 + ['B'] * 10,
                              'value': (self.df['A'].tolist() +
                                        self.df['B'].tolist())},
                             columns=['id1', 'id2', 'variable', 'value'])

        for type_ in (tuple, list, np.array):
            result = self.df.melt(id_vars=['id1', 'id2'],
                                  value_vars=type_(('A', 'B')))
            tm.assert_frame_equal(result, expected)

    def test_vars_work_with_multiindex(self):
        """Lists of tuples select columns of a MultiIndex frame."""
        expected = DataFrame({
            ('A', 'a'): self.df1[('A', 'a')],
            'CAP': ['B'] * len(self.df1),
            'low': ['b'] * len(self.df1),
            'value': self.df1[('B', 'b')],
        }, columns=[('A', 'a'), 'CAP', 'low', 'value'])

        result = self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')])
        tm.assert_frame_equal(result, expected)

    def test_tuple_vars_fail_with_multiindex(self):
        """A bare tuple for id_vars/value_vars raises on MultiIndex columns."""
        # melt should fail with an informative error message if
        # the columns have a MultiIndex and a tuple is passed
        # for id_vars or value_vars.
        tuple_a = ('A', 'a')
        list_a = [tuple_a]
        tuple_b = ('B', 'b')
        list_b = [tuple_b]

        for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b),
                                    (tuple_a, tuple_b)):
            with pytest.raises(ValueError, match=r'MultiIndex'):
                self.df1.melt(id_vars=id_vars, value_vars=value_vars)

    def test_custom_var_name(self):
        """``var_name`` renames the variable column only."""
        result5 = self.df.melt(var_name=self.var_name)
        assert result5.columns.tolist() == ['var', 'value']

        result6 = self.df.melt(id_vars=['id1'], var_name=self.var_name)
        assert result6.columns.tolist() == ['id1', 'var', 'value']

        result7 = self.df.melt(id_vars=['id1', 'id2'], var_name=self.var_name)
        assert result7.columns.tolist() == ['id1', 'id2', 'var', 'value']

        result8 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
                               var_name=self.var_name)
        assert result8.columns.tolist() == ['id1', 'id2', 'var', 'value']

        result9 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                               var_name=self.var_name)
        expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                               'id2': self.df['id2'].tolist() * 2,
                               self.var_name: ['A'] * 10 + ['B'] * 10,
                               'value': (self.df['A'].tolist() +
                                         self.df['B'].tolist())},
                              columns=['id1', 'id2', self.var_name, 'value'])
        tm.assert_frame_equal(result9, expected9)

    def test_custom_value_name(self):
        """``value_name`` renames the value column only."""
        result10 = self.df.melt(value_name=self.value_name)
        assert result10.columns.tolist() == ['variable', 'val']

        result11 = self.df.melt(id_vars=['id1'], value_name=self.value_name)
        assert result11.columns.tolist() == ['id1', 'variable', 'val']

        result12 = self.df.melt(id_vars=['id1', 'id2'],
                                value_name=self.value_name)
        assert result12.columns.tolist() == ['id1', 'id2', 'variable', 'val']

        result13 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
                                value_name=self.value_name)
        assert result13.columns.tolist() == ['id1', 'id2', 'variable', 'val']

        result14 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                                value_name=self.value_name)
        expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                                'id2': self.df['id2'].tolist() * 2,
                                'variable': ['A'] * 10 + ['B'] * 10,
                                self.value_name: (self.df['A'].tolist() +
                                                  self.df['B'].tolist())},
                               columns=['id1', 'id2', 'variable',
                                        self.value_name])
        tm.assert_frame_equal(result14, expected14)

    def test_custom_var_and_value_name(self):
        """``var_name`` and ``value_name`` can be combined."""
        result15 = self.df.melt(var_name=self.var_name,
                                value_name=self.value_name)
        assert result15.columns.tolist() == ['var', 'val']

        result16 = self.df.melt(id_vars=['id1'], var_name=self.var_name,
                                value_name=self.value_name)
        assert result16.columns.tolist() == ['id1', 'var', 'val']

        result17 = self.df.melt(id_vars=['id1', 'id2'],
                                var_name=self.var_name,
                                value_name=self.value_name)
        assert result17.columns.tolist() == ['id1', 'id2', 'var', 'val']

        result18 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
                                var_name=self.var_name,
                                value_name=self.value_name)
        assert result18.columns.tolist() == ['id1', 'id2', 'var', 'val']

        result19 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
                                var_name=self.var_name,
                                value_name=self.value_name)
        expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2,
                                'id2': self.df['id2'].tolist() * 2,
                                self.var_name: ['A'] * 10 + ['B'] * 10,
                                self.value_name: (self.df['A'].tolist() +
                                                  self.df['B'].tolist())},
                               columns=['id1', 'id2', self.var_name,
                                        self.value_name])
        tm.assert_frame_equal(result19, expected19)

        # A named columns index supplies the variable column's name.
        df20 = self.df.copy()
        df20.columns.name = 'foo'
        result20 = df20.melt()
        assert result20.columns.tolist() == ['foo', 'value']

    def test_col_level(self):
        """``col_level`` accepts a level position or a level name."""
        res1 = self.df1.melt(col_level=0)
        res2 = self.df1.melt(col_level='CAP')
        assert res1.columns.tolist() == ['CAP', 'value']
        assert res2.columns.tolist() == ['CAP', 'value']

    def test_multiindex(self):
        """Melting MultiIndex columns gives one variable column per level."""
        res = self.df1.melt()
        assert res.columns.tolist() == ['CAP', 'low', 'value']

    @pytest.mark.parametrize("col", [
        pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')),
        pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
        pd.Series([0, 1, 0, 0, 0])])
    def test_pandas_dtypes(self, col):
        """Melt a frame whose id and value columns hold ``col``."""
        # GH 15785
        df = DataFrame({'klass': range(5),
                        'col': col,
                        'attr1': [1, 0, 0, 0, 0],
                        'attr2': col})
        expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col],
                                   ignore_index=True)
        result = melt(df, id_vars=['klass', 'col'], var_name='attribute',
                      value_name='value')
        expected = DataFrame({0: list(range(5)) * 2,
                              1: pd.concat([col] * 2, ignore_index=True),
                              2: ['attr1'] * 5 + ['attr2'] * 5,
                              3: expected_value})
        expected.columns = ['klass', 'col', 'attribute', 'value']
        tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
class TestLreshape(object):
    """Tests for ``lreshape``."""

    def test_pairs(self):
        """Reshape paired visit-date/weight columns into long format."""
        data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
                            '11jan2009'],
                'birthwt': [1766, 3301, 1454, 3139, 4133],
                'id': [101, 102, 103, 104, 105],
                'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
                'visitdt1': ['11jan2009', '22dec2008', '04jan2009',
                             '29dec2008', '20jan2009'],
                'visitdt2':
                ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'],
                'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
                'wt1': [1823, 3338, 1549, 3298, 4306],
                'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
                'wt3': [2293.0, nan, nan, 3377.0, 4805.0]}

        df = DataFrame(data)

        spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)],
                'wt': ['wt%d' % i for i in range(1, 4)]}

        # Default call: the expected frame has no rows for missing visits.
        result = lreshape(df, spec)

        exp_data = {'birthdt':
                    ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
                     '11jan2009', '08jan2009', '30dec2008', '21dec2008',
                     '11jan2009', '08jan2009', '21dec2008', '11jan2009'],
                    'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139,
                                4133, 1766, 3139, 4133],
                    'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101,
                           104, 105],
                    'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
                            'Male', 'Female', 'Female', 'Female', 'Male',
                            'Female', 'Female'],
                    'visitdt': ['11jan2009', '22dec2008', '04jan2009',
                                '29dec2008', '20jan2009', '21jan2009',
                                '22jan2009', '31dec2008', '03feb2009',
                                '05feb2009', '02jan2009', '15feb2009'],
                    'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
                           1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]}
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        # dropna=False: every id appears once per visit, NaNs included.
        result = lreshape(df, spec, dropna=False)
        exp_data = {'birthdt':
                    ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
                     '11jan2009', '08jan2009', '20dec2008', '30dec2008',
                     '21dec2008', '11jan2009', '08jan2009', '20dec2008',
                     '30dec2008', '21dec2008', '11jan2009'],
                    'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454,
                                3139, 4133, 1766, 3301, 1454, 3139, 4133],
                    'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105,
                           101, 102, 103, 104, 105],
                    'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
                            'Male', 'Female', 'Female', 'Female', 'Female',
                            'Male', 'Female', 'Female', 'Female', 'Female'],
                    'visitdt': ['11jan2009', '22dec2008', '04jan2009',
                                '29dec2008', '20jan2009', '21jan2009', nan,
                                '22jan2009', '31dec2008', '03feb2009',
                                '05feb2009', nan, nan, '02jan2009',
                                '15feb2009'],
                    'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan,
                           1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0,
                           4805.0]}
        exp = DataFrame(exp_data, columns=result.columns)
        tm.assert_frame_equal(result, exp)

        # Column groups of different lengths (2 vs 3) must be rejected.
        spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)],
                'wt': ['wt%d' % i for i in range(1, 4)]}
        with pytest.raises(ValueError,
                           match='All column lists must be same length'):
            lreshape(df, spec)
|
||
|
|
||
|
|
||
|
class TestWideToLong(object):
|
||
|
|
||
|
def test_simple(self):
|
||
|
np.random.seed(123)
|
||
|
x = np.random.randn(3)
|
||
|
df = pd.DataFrame({"A1970": {0: "a",
|
||
|
1: "b",
|
||
|
2: "c"},
|
||
|
"A1980": {0: "d",
|
||
|
1: "e",
|
||
|
2: "f"},
|
||
|
"B1970": {0: 2.5,
|
||
|
1: 1.2,
|
||
|
2: .7},
|
||
|
"B1980": {0: 3.2,
|
||
|
1: 1.3,
|
||
|
2: .1},
|
||
|
"X": dict(zip(
|
||
|
range(3), x))})
|
||
|
df["id"] = df.index
|
||
|
exp_data = {"X": x.tolist() + x.tolist(),
|
||
|
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
|
||
|
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
|
||
|
"year": [1970, 1970, 1970, 1980, 1980, 1980],
|
||
|
"id": [0, 1, 2, 0, 1, 2]}
|
||
|
expected = DataFrame(exp_data)
|
||
|
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
|
||
|
result = wide_to_long(df, ["A", "B"], i="id", j="year")
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_stubs(self):
|
||
|
# GH9204
|
||
|
df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
|
||
|
df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2']
|
||
|
stubs = ['inc', 'edu']
|
||
|
|
||
|
# TODO: unused?
|
||
|
df_long = pd.wide_to_long(df, stubs, i='id', j='age') # noqa
|
||
|
|
||
|
assert stubs == ['inc', 'edu']
|
||
|
|
||
|
def test_separating_character(self):
|
||
|
# GH14779
|
||
|
np.random.seed(123)
|
||
|
x = np.random.randn(3)
|
||
|
df = pd.DataFrame({"A.1970": {0: "a",
|
||
|
1: "b",
|
||
|
2: "c"},
|
||
|
"A.1980": {0: "d",
|
||
|
1: "e",
|
||
|
2: "f"},
|
||
|
"B.1970": {0: 2.5,
|
||
|
1: 1.2,
|
||
|
2: .7},
|
||
|
"B.1980": {0: 3.2,
|
||
|
1: 1.3,
|
||
|
2: .1},
|
||
|
"X": dict(zip(
|
||
|
range(3), x))})
|
||
|
df["id"] = df.index
|
||
|
exp_data = {"X": x.tolist() + x.tolist(),
|
||
|
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
|
||
|
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
|
||
|
"year": [1970, 1970, 1970, 1980, 1980, 1980],
|
||
|
"id": [0, 1, 2, 0, 1, 2]}
|
||
|
expected = DataFrame(exp_data)
|
||
|
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
|
||
|
result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_escapable_characters(self):
|
||
|
np.random.seed(123)
|
||
|
x = np.random.randn(3)
|
||
|
df = pd.DataFrame({"A(quarterly)1970": {0: "a",
|
||
|
1: "b",
|
||
|
2: "c"},
|
||
|
"A(quarterly)1980": {0: "d",
|
||
|
1: "e",
|
||
|
2: "f"},
|
||
|
"B(quarterly)1970": {0: 2.5,
|
||
|
1: 1.2,
|
||
|
2: .7},
|
||
|
"B(quarterly)1980": {0: 3.2,
|
||
|
1: 1.3,
|
||
|
2: .1},
|
||
|
"X": dict(zip(
|
||
|
range(3), x))})
|
||
|
df["id"] = df.index
|
||
|
exp_data = {"X": x.tolist() + x.tolist(),
|
||
|
"A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'],
|
||
|
"B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
|
||
|
"year": [1970, 1970, 1970, 1980, 1980, 1980],
|
||
|
"id": [0, 1, 2, 0, 1, 2]}
|
||
|
expected = DataFrame(exp_data)
|
||
|
expected = expected.set_index(
|
||
|
['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]]
|
||
|
result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
|
||
|
i="id", j="year")
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_unbalanced(self):
|
||
|
# test that we can have a varying amount of time variables
|
||
|
df = pd.DataFrame({'A2010': [1.0, 2.0],
|
||
|
'A2011': [3.0, 4.0],
|
||
|
'B2010': [5.0, 6.0],
|
||
|
'X': ['X1', 'X2']})
|
||
|
df['id'] = df.index
|
||
|
exp_data = {'X': ['X1', 'X1', 'X2', 'X2'],
|
||
|
'A': [1.0, 3.0, 2.0, 4.0],
|
||
|
'B': [5.0, np.nan, 6.0, np.nan],
|
||
|
'id': [0, 0, 1, 1],
|
||
|
'year': [2010, 2011, 2010, 2011]}
|
||
|
expected = pd.DataFrame(exp_data)
|
||
|
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
|
||
|
result = wide_to_long(df, ['A', 'B'], i='id', j='year')
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_character_overlap(self):
|
||
|
# Test we handle overlapping characters in both id_vars and value_vars
|
||
|
df = pd.DataFrame({
|
||
|
'A11': ['a11', 'a22', 'a33'],
|
||
|
'A12': ['a21', 'a22', 'a23'],
|
||
|
'B11': ['b11', 'b12', 'b13'],
|
||
|
'B12': ['b21', 'b22', 'b23'],
|
||
|
'BB11': [1, 2, 3],
|
||
|
'BB12': [4, 5, 6],
|
||
|
'BBBX': [91, 92, 93],
|
||
|
'BBBZ': [91, 92, 93]
|
||
|
})
|
||
|
df['id'] = df.index
|
||
|
expected = pd.DataFrame({
|
||
|
'BBBX': [91, 92, 93, 91, 92, 93],
|
||
|
'BBBZ': [91, 92, 93, 91, 92, 93],
|
||
|
'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
|
||
|
'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
|
||
|
'BB': [1, 2, 3, 4, 5, 6],
|
||
|
'id': [0, 1, 2, 0, 1, 2],
|
||
|
'year': [11, 11, 11, 12, 12, 12]})
|
||
|
expected = expected.set_index(['id', 'year'])[
|
||
|
['BBBX', 'BBBZ', 'A', 'B', 'BB']]
|
||
|
result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
|
||
|
tm.assert_frame_equal(result.sort_index(axis=1),
|
||
|
expected.sort_index(axis=1))
|
||
|
|
||
|
def test_invalid_separator(self):
|
||
|
# if an invalid separator is supplied a empty data frame is returned
|
||
|
sep = 'nope!'
|
||
|
df = pd.DataFrame({'A2010': [1.0, 2.0],
|
||
|
'A2011': [3.0, 4.0],
|
||
|
'B2010': [5.0, 6.0],
|
||
|
'X': ['X1', 'X2']})
|
||
|
df['id'] = df.index
|
||
|
exp_data = {'X': '',
|
||
|
'A2010': [],
|
||
|
'A2011': [],
|
||
|
'B2010': [],
|
||
|
'id': [],
|
||
|
'year': [],
|
||
|
'A': [],
|
||
|
'B': []}
|
||
|
expected = pd.DataFrame(exp_data).astype({'year': 'int'})
|
||
|
expected = expected.set_index(['id', 'year'])[[
|
||
|
'X', 'A2010', 'A2011', 'B2010', 'A', 'B']]
|
||
|
expected.index.set_levels([0, 1], level=0, inplace=True)
|
||
|
result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
|
||
|
tm.assert_frame_equal(result.sort_index(axis=1),
|
||
|
expected.sort_index(axis=1))
|
||
|
|
||
|
def test_num_string_disambiguation(self):
|
||
|
# Test that we can disambiguate number value_vars from
|
||
|
# string value_vars
|
||
|
df = pd.DataFrame({
|
||
|
'A11': ['a11', 'a22', 'a33'],
|
||
|
'A12': ['a21', 'a22', 'a23'],
|
||
|
'B11': ['b11', 'b12', 'b13'],
|
||
|
'B12': ['b21', 'b22', 'b23'],
|
||
|
'BB11': [1, 2, 3],
|
||
|
'BB12': [4, 5, 6],
|
||
|
'Arating': [91, 92, 93],
|
||
|
'Arating_old': [91, 92, 93]
|
||
|
})
|
||
|
df['id'] = df.index
|
||
|
expected = pd.DataFrame({
|
||
|
'Arating': [91, 92, 93, 91, 92, 93],
|
||
|
'Arating_old': [91, 92, 93, 91, 92, 93],
|
||
|
'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
|
||
|
'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
|
||
|
'BB': [1, 2, 3, 4, 5, 6],
|
||
|
'id': [0, 1, 2, 0, 1, 2],
|
||
|
'year': [11, 11, 11, 12, 12, 12]})
|
||
|
expected = expected.set_index(['id', 'year'])[
|
||
|
['Arating', 'Arating_old', 'A', 'B', 'BB']]
|
||
|
result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
|
||
|
tm.assert_frame_equal(result.sort_index(axis=1),
|
||
|
expected.sort_index(axis=1))
|
||
|
|
||
|
def test_invalid_suffixtype(self):
|
||
|
# If all stubs names end with a string, but a numeric suffix is
|
||
|
# assumed, an empty data frame is returned
|
||
|
df = pd.DataFrame({'Aone': [1.0, 2.0],
|
||
|
'Atwo': [3.0, 4.0],
|
||
|
'Bone': [5.0, 6.0],
|
||
|
'X': ['X1', 'X2']})
|
||
|
df['id'] = df.index
|
||
|
exp_data = {'X': '',
|
||
|
'Aone': [],
|
||
|
'Atwo': [],
|
||
|
'Bone': [],
|
||
|
'id': [],
|
||
|
'year': [],
|
||
|
'A': [],
|
||
|
'B': []}
|
||
|
expected = pd.DataFrame(exp_data).astype({'year': 'int'})
|
||
|
|
||
|
expected = expected.set_index(['id', 'year'])
|
||
|
expected.index.set_levels([0, 1], level=0, inplace=True)
|
||
|
result = wide_to_long(df, ['A', 'B'], i='id', j='year')
|
||
|
tm.assert_frame_equal(result.sort_index(axis=1),
|
||
|
expected.sort_index(axis=1))
|
||
|
|
||
|
def test_multiple_id_columns(self):
|
||
|
# Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
|
||
|
df = pd.DataFrame({
|
||
|
'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
|
||
|
'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
|
||
|
'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
|
||
|
'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
|
||
|
})
|
||
|
expected = pd.DataFrame({
|
||
|
'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8,
|
||
|
2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9],
|
||
|
'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
|
||
|
'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
|
||
|
'age': [1, 2, 1, 2, 1, 2, 1, 2, 1,
|
||
|
2, 1, 2, 1, 2, 1, 2, 1, 2]
|
||
|
})
|
||
|
expected = expected.set_index(['famid', 'birth', 'age'])[['ht']]
|
||
|
result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_non_unique_idvars(self):
|
||
|
# GH16382
|
||
|
# Raise an error message if non unique id vars (i) are passed
|
||
|
df = pd.DataFrame({
|
||
|
'A_A1': [1, 2, 3, 4, 5],
|
||
|
'B_B1': [1, 2, 3, 4, 5],
|
||
|
'x': [1, 1, 1, 1, 1]
|
||
|
})
|
||
|
with pytest.raises(ValueError):
|
||
|
wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname')
|
||
|
|
||
|
def test_cast_j_int(self):
|
||
|
df = pd.DataFrame({
|
||
|
'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'],
|
||
|
'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'],
|
||
|
'actor_fb_likes_1': [1000.0, 40000.0, 11000.0],
|
||
|
'actor_fb_likes_2': [936.0, 5000.0, 393.0],
|
||
|
'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']})
|
||
|
|
||
|
expected = pd.DataFrame({
|
||
|
'actor': ['CCH Pounder',
|
||
|
'Johnny Depp',
|
||
|
'Christoph Waltz',
|
||
|
'Joel David Moore',
|
||
|
'Orlando Bloom',
|
||
|
'Rory Kinnear'],
|
||
|
'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
|
||
|
'num': [1, 1, 1, 2, 2, 2],
|
||
|
'title': ['Avatar',
|
||
|
'Pirates of the Caribbean',
|
||
|
'Spectre',
|
||
|
'Avatar',
|
||
|
'Pirates of the Caribbean',
|
||
|
'Spectre']}).set_index(['title', 'num'])
|
||
|
result = wide_to_long(df, ['actor', 'actor_fb_likes'],
|
||
|
i='title', j='num', sep='_')
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_identical_stubnames(self):
|
||
|
df = pd.DataFrame({'A2010': [1.0, 2.0],
|
||
|
'A2011': [3.0, 4.0],
|
||
|
'B2010': [5.0, 6.0],
|
||
|
'A': ['X1', 'X2']})
|
||
|
with pytest.raises(ValueError):
|
||
|
wide_to_long(df, ['A', 'B'], i='A', j='colname')
|
||
|
|
||
|
def test_nonnumeric_suffix(self):
|
||
|
df = pd.DataFrame({'treatment_placebo': [1.0, 2.0],
|
||
|
'treatment_test': [3.0, 4.0],
|
||
|
'result_placebo': [5.0, 6.0],
|
||
|
'A': ['X1', 'X2']})
|
||
|
expected = pd.DataFrame({
|
||
|
'A': ['X1', 'X1', 'X2', 'X2'],
|
||
|
'colname': ['placebo', 'test', 'placebo', 'test'],
|
||
|
'result': [5.0, np.nan, 6.0, np.nan],
|
||
|
'treatment': [1.0, 3.0, 2.0, 4.0]})
|
||
|
expected = expected.set_index(['A', 'colname'])
|
||
|
result = wide_to_long(df, ['result', 'treatment'],
|
||
|
i='A', j='colname', suffix='[a-z]+', sep='_')
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_mixed_type_suffix(self):
|
||
|
df = pd.DataFrame({
|
||
|
'A': ['X1', 'X2'],
|
||
|
'result_1': [0, 9],
|
||
|
'result_foo': [5.0, 6.0],
|
||
|
'treatment_1': [1.0, 2.0],
|
||
|
'treatment_foo': [3.0, 4.0]})
|
||
|
expected = pd.DataFrame({
|
||
|
'A': ['X1', 'X2', 'X1', 'X2'],
|
||
|
'colname': ['1', '1', 'foo', 'foo'],
|
||
|
'result': [0.0, 9.0, 5.0, 6.0],
|
||
|
'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname'])
|
||
|
result = wide_to_long(df, ['result', 'treatment'],
|
||
|
i='A', j='colname', suffix='.+', sep='_')
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_float_suffix(self):
|
||
|
df = pd.DataFrame({
|
||
|
'treatment_1.1': [1.0, 2.0],
|
||
|
'treatment_2.1': [3.0, 4.0],
|
||
|
'result_1.2': [5.0, 6.0],
|
||
|
'result_1': [0, 9],
|
||
|
'A': ['X1', 'X2']})
|
||
|
expected = pd.DataFrame({
|
||
|
'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'],
|
||
|
'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
|
||
|
'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
|
||
|
'treatment': [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0]})
|
||
|
expected = expected.set_index(['A', 'colname'])
|
||
|
result = wide_to_long(df, ['result', 'treatment'],
|
||
|
i='A', j='colname', suffix='[0-9.]+', sep='_')
|
||
|
tm.assert_frame_equal(result, expected)
|