laywerrobot/lib/python3.6/site-packages/pandas/tests/reshape/test_melt.py

643 lines
28 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
# -*- coding: utf-8 -*-
# pylint: disable-msg=W0612,E1101
import pytest
from pandas import DataFrame
import pandas as pd
from numpy import nan
import numpy as np
from pandas import melt, lreshape, wide_to_long
import pandas.util.testing as tm
from pandas.compat import range
class TestMelt(object):
def setup_method(self, method):
self.df = tm.makeTimeDataFrame()[:10]
self.df['id1'] = (self.df['A'] > 0).astype(np.int64)
self.df['id2'] = (self.df['B'] > 0).astype(np.int64)
self.var_name = 'var'
self.value_name = 'val'
self.df1 = pd.DataFrame([[1.067683, -1.110463, 0.20867
], [-1.321405, 0.368915, -1.055342],
[-0.807333, 0.08298, -0.873361]])
self.df1.columns = [list('ABC'), list('abc')]
self.df1.columns.names = ['CAP', 'low']
def test_top_level_method(self):
result = melt(self.df)
assert result.columns.tolist() == ['variable', 'value']
def test_method_signatures(self):
tm.assert_frame_equal(self.df.melt(),
melt(self.df))
tm.assert_frame_equal(self.df.melt(id_vars=['id1', 'id2'],
value_vars=['A', 'B']),
melt(self.df,
id_vars=['id1', 'id2'],
value_vars=['A', 'B']))
tm.assert_frame_equal(self.df.melt(var_name=self.var_name,
value_name=self.value_name),
melt(self.df,
var_name=self.var_name,
value_name=self.value_name))
tm.assert_frame_equal(self.df1.melt(col_level=0),
melt(self.df1, col_level=0))
def test_default_col_names(self):
result = self.df.melt()
assert result.columns.tolist() == ['variable', 'value']
result1 = self.df.melt(id_vars=['id1'])
assert result1.columns.tolist() == ['id1', 'variable', 'value']
result2 = self.df.melt(id_vars=['id1', 'id2'])
assert result2.columns.tolist() == ['id1', 'id2', 'variable', 'value']
def test_value_vars(self):
result3 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A')
assert len(result3) == 10
result4 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'])
expected4 = DataFrame({'id1': self.df['id1'].tolist() * 2,
'id2': self.df['id2'].tolist() * 2,
'variable': ['A'] * 10 + ['B'] * 10,
'value': (self.df['A'].tolist() +
self.df['B'].tolist())},
columns=['id1', 'id2', 'variable', 'value'])
tm.assert_frame_equal(result4, expected4)
def test_value_vars_types(self):
# GH 15348
expected = DataFrame({'id1': self.df['id1'].tolist() * 2,
'id2': self.df['id2'].tolist() * 2,
'variable': ['A'] * 10 + ['B'] * 10,
'value': (self.df['A'].tolist() +
self.df['B'].tolist())},
columns=['id1', 'id2', 'variable', 'value'])
for type_ in (tuple, list, np.array):
result = self.df.melt(id_vars=['id1', 'id2'],
value_vars=type_(('A', 'B')))
tm.assert_frame_equal(result, expected)
def test_vars_work_with_multiindex(self):
expected = DataFrame({
('A', 'a'): self.df1[('A', 'a')],
'CAP': ['B'] * len(self.df1),
'low': ['b'] * len(self.df1),
'value': self.df1[('B', 'b')],
}, columns=[('A', 'a'), 'CAP', 'low', 'value'])
result = self.df1.melt(id_vars=[('A', 'a')], value_vars=[('B', 'b')])
tm.assert_frame_equal(result, expected)
def test_tuple_vars_fail_with_multiindex(self):
# melt should fail with an informative error message if
# the columns have a MultiIndex and a tuple is passed
# for id_vars or value_vars.
tuple_a = ('A', 'a')
list_a = [tuple_a]
tuple_b = ('B', 'b')
list_b = [tuple_b]
for id_vars, value_vars in ((tuple_a, list_b), (list_a, tuple_b),
(tuple_a, tuple_b)):
with tm.assert_raises_regex(ValueError, r'MultiIndex'):
self.df1.melt(id_vars=id_vars, value_vars=value_vars)
def test_custom_var_name(self):
result5 = self.df.melt(var_name=self.var_name)
assert result5.columns.tolist() == ['var', 'value']
result6 = self.df.melt(id_vars=['id1'], var_name=self.var_name)
assert result6.columns.tolist() == ['id1', 'var', 'value']
result7 = self.df.melt(id_vars=['id1', 'id2'], var_name=self.var_name)
assert result7.columns.tolist() == ['id1', 'id2', 'var', 'value']
result8 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
var_name=self.var_name)
assert result8.columns.tolist() == ['id1', 'id2', 'var', 'value']
result9 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
var_name=self.var_name)
expected9 = DataFrame({'id1': self.df['id1'].tolist() * 2,
'id2': self.df['id2'].tolist() * 2,
self.var_name: ['A'] * 10 + ['B'] * 10,
'value': (self.df['A'].tolist() +
self.df['B'].tolist())},
columns=['id1', 'id2', self.var_name, 'value'])
tm.assert_frame_equal(result9, expected9)
def test_custom_value_name(self):
result10 = self.df.melt(value_name=self.value_name)
assert result10.columns.tolist() == ['variable', 'val']
result11 = self.df.melt(id_vars=['id1'], value_name=self.value_name)
assert result11.columns.tolist() == ['id1', 'variable', 'val']
result12 = self.df.melt(id_vars=['id1', 'id2'],
value_name=self.value_name)
assert result12.columns.tolist() == ['id1', 'id2', 'variable', 'val']
result13 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
value_name=self.value_name)
assert result13.columns.tolist() == ['id1', 'id2', 'variable', 'val']
result14 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
value_name=self.value_name)
expected14 = DataFrame({'id1': self.df['id1'].tolist() * 2,
'id2': self.df['id2'].tolist() * 2,
'variable': ['A'] * 10 + ['B'] * 10,
self.value_name: (self.df['A'].tolist() +
self.df['B'].tolist())},
columns=['id1', 'id2', 'variable',
self.value_name])
tm.assert_frame_equal(result14, expected14)
def test_custom_var_and_value_name(self):
result15 = self.df.melt(var_name=self.var_name,
value_name=self.value_name)
assert result15.columns.tolist() == ['var', 'val']
result16 = self.df.melt(id_vars=['id1'], var_name=self.var_name,
value_name=self.value_name)
assert result16.columns.tolist() == ['id1', 'var', 'val']
result17 = self.df.melt(id_vars=['id1', 'id2'],
var_name=self.var_name,
value_name=self.value_name)
assert result17.columns.tolist() == ['id1', 'id2', 'var', 'val']
result18 = self.df.melt(id_vars=['id1', 'id2'], value_vars='A',
var_name=self.var_name,
value_name=self.value_name)
assert result18.columns.tolist() == ['id1', 'id2', 'var', 'val']
result19 = self.df.melt(id_vars=['id1', 'id2'], value_vars=['A', 'B'],
var_name=self.var_name,
value_name=self.value_name)
expected19 = DataFrame({'id1': self.df['id1'].tolist() * 2,
'id2': self.df['id2'].tolist() * 2,
self.var_name: ['A'] * 10 + ['B'] * 10,
self.value_name: (self.df['A'].tolist() +
self.df['B'].tolist())},
columns=['id1', 'id2', self.var_name,
self.value_name])
tm.assert_frame_equal(result19, expected19)
df20 = self.df.copy()
df20.columns.name = 'foo'
result20 = df20.melt()
assert result20.columns.tolist() == ['foo', 'value']
def test_col_level(self):
res1 = self.df1.melt(col_level=0)
res2 = self.df1.melt(col_level='CAP')
assert res1.columns.tolist() == ['CAP', 'value']
assert res2.columns.tolist() == ['CAP', 'value']
def test_multiindex(self):
res = self.df1.melt()
assert res.columns.tolist() == ['CAP', 'low', 'value']
@pytest.mark.parametrize("col", [
pd.Series(pd.date_range('2010', periods=5, tz='US/Pacific')),
pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
pd.Series([0, 1, 0, 0, 0])])
def test_pandas_dtypes(self, col):
# GH 15785
df = DataFrame({'klass': range(5),
'col': col,
'attr1': [1, 0, 0, 0, 0],
'attr2': col})
expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col],
ignore_index=True)
result = melt(df, id_vars=['klass', 'col'], var_name='attribute',
value_name='value')
expected = DataFrame({0: list(range(5)) * 2,
1: pd.concat([col] * 2, ignore_index=True),
2: ['attr1'] * 5 + ['attr2'] * 5,
3: expected_value})
expected.columns = ['klass', 'col', 'attribute', 'value']
tm.assert_frame_equal(result, expected)
class TestLreshape(object):
def test_pairs(self):
data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008',
'11jan2009'],
'birthwt': [1766, 3301, 1454, 3139, 4133],
'id': [101, 102, 103, 104, 105],
'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
'visitdt1': ['11jan2009', '22dec2008', '04jan2009',
'29dec2008', '20jan2009'],
'visitdt2':
['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'],
'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
'wt1': [1823, 3338, 1549, 3298, 4306],
'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
'wt3': [2293.0, nan, nan, 3377.0, 4805.0]}
df = DataFrame(data)
spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)],
'wt': ['wt%d' % i for i in range(1, 4)]}
result = lreshape(df, spec)
exp_data = {'birthdt':
['08jan2009', '20dec2008', '30dec2008', '21dec2008',
'11jan2009', '08jan2009', '30dec2008', '21dec2008',
'11jan2009', '08jan2009', '21dec2008', '11jan2009'],
'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139,
4133, 1766, 3139, 4133],
'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101,
104, 105],
'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
'Male', 'Female', 'Female', 'Female', 'Male',
'Female', 'Female'],
'visitdt': ['11jan2009', '22dec2008', '04jan2009',
'29dec2008', '20jan2009', '21jan2009',
'22jan2009', '31dec2008', '03feb2009',
'05feb2009', '02jan2009', '15feb2009'],
'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]}
exp = DataFrame(exp_data, columns=result.columns)
tm.assert_frame_equal(result, exp)
result = lreshape(df, spec, dropna=False)
exp_data = {'birthdt':
['08jan2009', '20dec2008', '30dec2008', '21dec2008',
'11jan2009', '08jan2009', '20dec2008', '30dec2008',
'21dec2008', '11jan2009', '08jan2009', '20dec2008',
'30dec2008', '21dec2008', '11jan2009'],
'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454,
3139, 4133, 1766, 3301, 1454, 3139, 4133],
'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105,
101, 102, 103, 104, 105],
'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
'Male', 'Female', 'Female', 'Female', 'Female',
'Male', 'Female', 'Female', 'Female', 'Female'],
'visitdt': ['11jan2009', '22dec2008', '04jan2009',
'29dec2008', '20jan2009', '21jan2009', nan,
'22jan2009', '31dec2008', '03feb2009',
'05feb2009', nan, nan, '02jan2009',
'15feb2009'],
'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan,
1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0,
4805.0]}
exp = DataFrame(exp_data, columns=result.columns)
tm.assert_frame_equal(result, exp)
spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)],
'wt': ['wt%d' % i for i in range(1, 4)]}
pytest.raises(ValueError, lreshape, df, spec)
class TestWideToLong(object):
def test_simple(self):
np.random.seed(123)
x = np.random.randn(3)
df = pd.DataFrame({"A1970": {0: "a",
1: "b",
2: "c"},
"A1980": {0: "d",
1: "e",
2: "f"},
"B1970": {0: 2.5,
1: 1.2,
2: .7},
"B1980": {0: 3.2,
1: 1.3,
2: .1},
"X": dict(zip(
range(3), x))})
df["id"] = df.index
exp_data = {"X": x.tolist() + x.tolist(),
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2]}
expected = DataFrame(exp_data)
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
result = wide_to_long(df, ["A", "B"], i="id", j="year")
tm.assert_frame_equal(result, expected)
def test_stubs(self):
# GH9204
df = pd.DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
df.columns = ['id', 'inc1', 'inc2', 'edu1', 'edu2']
stubs = ['inc', 'edu']
# TODO: unused?
df_long = pd.wide_to_long(df, stubs, i='id', j='age') # noqa
assert stubs == ['inc', 'edu']
def test_separating_character(self):
# GH14779
np.random.seed(123)
x = np.random.randn(3)
df = pd.DataFrame({"A.1970": {0: "a",
1: "b",
2: "c"},
"A.1980": {0: "d",
1: "e",
2: "f"},
"B.1970": {0: 2.5,
1: 1.2,
2: .7},
"B.1980": {0: 3.2,
1: 1.3,
2: .1},
"X": dict(zip(
range(3), x))})
df["id"] = df.index
exp_data = {"X": x.tolist() + x.tolist(),
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2]}
expected = DataFrame(exp_data)
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
tm.assert_frame_equal(result, expected)
def test_escapable_characters(self):
np.random.seed(123)
x = np.random.randn(3)
df = pd.DataFrame({"A(quarterly)1970": {0: "a",
1: "b",
2: "c"},
"A(quarterly)1980": {0: "d",
1: "e",
2: "f"},
"B(quarterly)1970": {0: 2.5,
1: 1.2,
2: .7},
"B(quarterly)1980": {0: 3.2,
1: 1.3,
2: .1},
"X": dict(zip(
range(3), x))})
df["id"] = df.index
exp_data = {"X": x.tolist() + x.tolist(),
"A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'],
"B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2]}
expected = DataFrame(exp_data)
expected = expected.set_index(
['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]]
result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
i="id", j="year")
tm.assert_frame_equal(result, expected)
def test_unbalanced(self):
# test that we can have a varying amount of time variables
df = pd.DataFrame({'A2010': [1.0, 2.0],
'A2011': [3.0, 4.0],
'B2010': [5.0, 6.0],
'X': ['X1', 'X2']})
df['id'] = df.index
exp_data = {'X': ['X1', 'X1', 'X2', 'X2'],
'A': [1.0, 3.0, 2.0, 4.0],
'B': [5.0, np.nan, 6.0, np.nan],
'id': [0, 0, 1, 1],
'year': [2010, 2011, 2010, 2011]}
expected = pd.DataFrame(exp_data)
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
result = wide_to_long(df, ['A', 'B'], i='id', j='year')
tm.assert_frame_equal(result, expected)
def test_character_overlap(self):
# Test we handle overlapping characters in both id_vars and value_vars
df = pd.DataFrame({
'A11': ['a11', 'a22', 'a33'],
'A12': ['a21', 'a22', 'a23'],
'B11': ['b11', 'b12', 'b13'],
'B12': ['b21', 'b22', 'b23'],
'BB11': [1, 2, 3],
'BB12': [4, 5, 6],
'BBBX': [91, 92, 93],
'BBBZ': [91, 92, 93]
})
df['id'] = df.index
expected = pd.DataFrame({
'BBBX': [91, 92, 93, 91, 92, 93],
'BBBZ': [91, 92, 93, 91, 92, 93],
'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
'BB': [1, 2, 3, 4, 5, 6],
'id': [0, 1, 2, 0, 1, 2],
'year': [11, 11, 11, 12, 12, 12]})
expected = expected.set_index(['id', 'year'])[
['BBBX', 'BBBZ', 'A', 'B', 'BB']]
result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
tm.assert_frame_equal(result.sort_index(axis=1),
expected.sort_index(axis=1))
def test_invalid_separator(self):
# if an invalid separator is supplied a empty data frame is returned
sep = 'nope!'
df = pd.DataFrame({'A2010': [1.0, 2.0],
'A2011': [3.0, 4.0],
'B2010': [5.0, 6.0],
'X': ['X1', 'X2']})
df['id'] = df.index
exp_data = {'X': '',
'A2010': [],
'A2011': [],
'B2010': [],
'id': [],
'year': [],
'A': [],
'B': []}
expected = pd.DataFrame(exp_data).astype({'year': 'int'})
expected = expected.set_index(['id', 'year'])[[
'X', 'A2010', 'A2011', 'B2010', 'A', 'B']]
expected.index.set_levels([0, 1], level=0, inplace=True)
result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
tm.assert_frame_equal(result.sort_index(axis=1),
expected.sort_index(axis=1))
def test_num_string_disambiguation(self):
# Test that we can disambiguate number value_vars from
# string value_vars
df = pd.DataFrame({
'A11': ['a11', 'a22', 'a33'],
'A12': ['a21', 'a22', 'a23'],
'B11': ['b11', 'b12', 'b13'],
'B12': ['b21', 'b22', 'b23'],
'BB11': [1, 2, 3],
'BB12': [4, 5, 6],
'Arating': [91, 92, 93],
'Arating_old': [91, 92, 93]
})
df['id'] = df.index
expected = pd.DataFrame({
'Arating': [91, 92, 93, 91, 92, 93],
'Arating_old': [91, 92, 93, 91, 92, 93],
'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
'BB': [1, 2, 3, 4, 5, 6],
'id': [0, 1, 2, 0, 1, 2],
'year': [11, 11, 11, 12, 12, 12]})
expected = expected.set_index(['id', 'year'])[
['Arating', 'Arating_old', 'A', 'B', 'BB']]
result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
tm.assert_frame_equal(result.sort_index(axis=1),
expected.sort_index(axis=1))
def test_invalid_suffixtype(self):
# If all stubs names end with a string, but a numeric suffix is
# assumed, an empty data frame is returned
df = pd.DataFrame({'Aone': [1.0, 2.0],
'Atwo': [3.0, 4.0],
'Bone': [5.0, 6.0],
'X': ['X1', 'X2']})
df['id'] = df.index
exp_data = {'X': '',
'Aone': [],
'Atwo': [],
'Bone': [],
'id': [],
'year': [],
'A': [],
'B': []}
expected = pd.DataFrame(exp_data).astype({'year': 'int'})
expected = expected.set_index(['id', 'year'])
expected.index.set_levels([0, 1], level=0, inplace=True)
result = wide_to_long(df, ['A', 'B'], i='id', j='year')
tm.assert_frame_equal(result.sort_index(axis=1),
expected.sort_index(axis=1))
def test_multiple_id_columns(self):
# Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
df = pd.DataFrame({
'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
})
expected = pd.DataFrame({
'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8,
2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9],
'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
'age': [1, 2, 1, 2, 1, 2, 1, 2, 1,
2, 1, 2, 1, 2, 1, 2, 1, 2]
})
expected = expected.set_index(['famid', 'birth', 'age'])[['ht']]
result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
tm.assert_frame_equal(result, expected)
def test_non_unique_idvars(self):
# GH16382
# Raise an error message if non unique id vars (i) are passed
df = pd.DataFrame({
'A_A1': [1, 2, 3, 4, 5],
'B_B1': [1, 2, 3, 4, 5],
'x': [1, 1, 1, 1, 1]
})
with pytest.raises(ValueError):
wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname')
def test_cast_j_int(self):
df = pd.DataFrame({
'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'],
'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'],
'actor_fb_likes_1': [1000.0, 40000.0, 11000.0],
'actor_fb_likes_2': [936.0, 5000.0, 393.0],
'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']})
expected = pd.DataFrame({
'actor': ['CCH Pounder',
'Johnny Depp',
'Christoph Waltz',
'Joel David Moore',
'Orlando Bloom',
'Rory Kinnear'],
'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
'num': [1, 1, 1, 2, 2, 2],
'title': ['Avatar',
'Pirates of the Caribbean',
'Spectre',
'Avatar',
'Pirates of the Caribbean',
'Spectre']}).set_index(['title', 'num'])
result = wide_to_long(df, ['actor', 'actor_fb_likes'],
i='title', j='num', sep='_')
tm.assert_frame_equal(result, expected)
def test_identical_stubnames(self):
df = pd.DataFrame({'A2010': [1.0, 2.0],
'A2011': [3.0, 4.0],
'B2010': [5.0, 6.0],
'A': ['X1', 'X2']})
with pytest.raises(ValueError):
wide_to_long(df, ['A', 'B'], i='A', j='colname')
def test_nonnumeric_suffix(self):
df = pd.DataFrame({'treatment_placebo': [1.0, 2.0],
'treatment_test': [3.0, 4.0],
'result_placebo': [5.0, 6.0],
'A': ['X1', 'X2']})
expected = pd.DataFrame({
'A': ['X1', 'X1', 'X2', 'X2'],
'colname': ['placebo', 'test', 'placebo', 'test'],
'result': [5.0, np.nan, 6.0, np.nan],
'treatment': [1.0, 3.0, 2.0, 4.0]})
expected = expected.set_index(['A', 'colname'])
result = wide_to_long(df, ['result', 'treatment'],
i='A', j='colname', suffix='[a-z]+', sep='_')
tm.assert_frame_equal(result, expected)
def test_mixed_type_suffix(self):
df = pd.DataFrame({
'A': ['X1', 'X2'],
'result_1': [0, 9],
'result_foo': [5.0, 6.0],
'treatment_1': [1.0, 2.0],
'treatment_foo': [3.0, 4.0]})
expected = pd.DataFrame({
'A': ['X1', 'X2', 'X1', 'X2'],
'colname': ['1', '1', 'foo', 'foo'],
'result': [0.0, 9.0, 5.0, 6.0],
'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname'])
result = wide_to_long(df, ['result', 'treatment'],
i='A', j='colname', suffix='.+', sep='_')
tm.assert_frame_equal(result, expected)
def test_float_suffix(self):
df = pd.DataFrame({
'treatment_1.1': [1.0, 2.0],
'treatment_2.1': [3.0, 4.0],
'result_1.2': [5.0, 6.0],
'result_1': [0, 9],
'A': ['X1', 'X2']})
expected = pd.DataFrame({
'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'],
'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
'treatment': [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0]})
expected = expected.set_index(['A', 'colname'])
result = wide_to_long(df, ['result', 'treatment'],
i='A', j='colname', suffix='[0-9.]+', sep='_')
tm.assert_frame_equal(result, expected)