525 lines
21 KiB
Python
525 lines
21 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
# pylint: disable-msg=W0612,E1101
|
||
|
|
||
|
from warnings import catch_warnings
|
||
|
import pytest
|
||
|
from collections import OrderedDict
|
||
|
|
||
|
from pandas import DataFrame, Series
|
||
|
import pandas as pd
|
||
|
|
||
|
from numpy import nan
|
||
|
import numpy as np
|
||
|
|
||
|
from pandas.util.testing import assert_frame_equal
|
||
|
|
||
|
from pandas import get_dummies, Categorical, Index
|
||
|
import pandas.util.testing as tm
|
||
|
from pandas.compat import u
|
||
|
|
||
|
|
||
|
class TestGetDummies(object):
|
||
|
|
||
|
@pytest.fixture
|
||
|
def df(self):
|
||
|
return DataFrame({'A': ['a', 'b', 'a'],
|
||
|
'B': ['b', 'b', 'c'],
|
||
|
'C': [1, 2, 3]})
|
||
|
|
||
|
@pytest.fixture(params=['uint8', 'i8', np.float64, bool, None])
|
||
|
def dtype(self, request):
|
||
|
return np.dtype(request.param)
|
||
|
|
||
|
@pytest.fixture(params=['dense', 'sparse'])
|
||
|
def sparse(self, request):
|
||
|
# params are strings to simplify reading test results,
|
||
|
# e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
|
||
|
return request.param == 'sparse'
|
||
|
|
||
|
def effective_dtype(self, dtype):
|
||
|
if dtype is None:
|
||
|
return np.uint8
|
||
|
return dtype
|
||
|
|
||
|
def test_raises_on_dtype_object(self, df):
|
||
|
with pytest.raises(ValueError):
|
||
|
get_dummies(df, dtype='object')
|
||
|
|
||
|
def test_basic(self, sparse, dtype):
|
||
|
s_list = list('abc')
|
||
|
s_series = Series(s_list)
|
||
|
s_series_index = Series(s_list, list('ABC'))
|
||
|
|
||
|
expected = DataFrame({'a': [1, 0, 0],
|
||
|
'b': [0, 1, 0],
|
||
|
'c': [0, 0, 1]},
|
||
|
dtype=self.effective_dtype(dtype))
|
||
|
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
expected.index = list('ABC')
|
||
|
result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_basic_types(self, sparse, dtype):
|
||
|
# GH 10531
|
||
|
s_list = list('abc')
|
||
|
s_series = Series(s_list)
|
||
|
s_df = DataFrame({'a': [0, 1, 0, 1, 2],
|
||
|
'b': ['A', 'A', 'B', 'C', 'C'],
|
||
|
'c': [2, 3, 3, 3, 2]})
|
||
|
|
||
|
expected = DataFrame({'a': [1, 0, 0],
|
||
|
'b': [0, 1, 0],
|
||
|
'c': [0, 0, 1]},
|
||
|
dtype=self.effective_dtype(dtype),
|
||
|
columns=list('abc'))
|
||
|
if not sparse:
|
||
|
compare = tm.assert_frame_equal
|
||
|
else:
|
||
|
expected = expected.to_sparse(fill_value=0, kind='integer')
|
||
|
compare = tm.assert_sp_frame_equal
|
||
|
|
||
|
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
|
||
|
compare(result, expected)
|
||
|
|
||
|
result = get_dummies(s_series, sparse=sparse, dtype=dtype)
|
||
|
compare(result, expected)
|
||
|
|
||
|
result = get_dummies(s_df, columns=s_df.columns,
|
||
|
sparse=sparse, dtype=dtype)
|
||
|
tm.assert_series_equal(result.get_dtype_counts(),
|
||
|
Series({dtype.name: 8}))
|
||
|
|
||
|
result = get_dummies(s_df, columns=['a'], sparse=sparse, dtype=dtype)
|
||
|
dtype_name = self.effective_dtype(dtype).name
|
||
|
|
||
|
expected_counts = {'int64': 1, 'object': 1}
|
||
|
expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)
|
||
|
|
||
|
expected = Series(expected_counts).sort_index()
|
||
|
tm.assert_series_equal(result.get_dtype_counts().sort_index(),
|
||
|
expected)
|
||
|
|
||
|
def test_just_na(self, sparse):
|
||
|
just_na_list = [np.nan]
|
||
|
just_na_series = Series(just_na_list)
|
||
|
just_na_series_index = Series(just_na_list, index=['A'])
|
||
|
|
||
|
res_list = get_dummies(just_na_list, sparse=sparse)
|
||
|
res_series = get_dummies(just_na_series, sparse=sparse)
|
||
|
res_series_index = get_dummies(just_na_series_index, sparse=sparse)
|
||
|
|
||
|
assert res_list.empty
|
||
|
assert res_series.empty
|
||
|
assert res_series_index.empty
|
||
|
|
||
|
assert res_list.index.tolist() == [0]
|
||
|
assert res_series.index.tolist() == [0]
|
||
|
assert res_series_index.index.tolist() == ['A']
|
||
|
|
||
|
def test_include_na(self, sparse, dtype):
|
||
|
if sparse:
|
||
|
pytest.xfail(reason='nan in index is problematic (GH 16894)')
|
||
|
|
||
|
s = ['a', 'b', np.nan]
|
||
|
res = get_dummies(s, sparse=sparse, dtype=dtype)
|
||
|
exp = DataFrame({'a': [1, 0, 0],
|
||
|
'b': [0, 1, 0]},
|
||
|
dtype=self.effective_dtype(dtype))
|
||
|
assert_frame_equal(res, exp)
|
||
|
|
||
|
# Sparse dataframes do not allow nan labelled columns, see #GH8822
|
||
|
res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
|
||
|
exp_na = DataFrame({nan: [0, 0, 1],
|
||
|
'a': [1, 0, 0],
|
||
|
'b': [0, 1, 0]},
|
||
|
dtype=self.effective_dtype(dtype))
|
||
|
exp_na = exp_na.reindex(['a', 'b', nan], axis=1)
|
||
|
# hack (NaN handling in assert_index_equal)
|
||
|
exp_na.columns = res_na.columns
|
||
|
assert_frame_equal(res_na, exp_na)
|
||
|
|
||
|
res_just_na = get_dummies([nan], dummy_na=True,
|
||
|
sparse=sparse, dtype=dtype)
|
||
|
exp_just_na = DataFrame(Series(1, index=[0]), columns=[nan],
|
||
|
dtype=self.effective_dtype(dtype))
|
||
|
tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
|
||
|
|
||
|
def test_unicode(self, sparse):
|
||
|
# See GH 6885 - get_dummies chokes on unicode values
|
||
|
import unicodedata
|
||
|
e = 'e'
|
||
|
eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
|
||
|
s = [e, eacute, eacute]
|
||
|
res = get_dummies(s, prefix='letter', sparse=sparse)
|
||
|
exp = DataFrame({'letter_e': [1, 0, 0],
|
||
|
u('letter_%s') % eacute: [0, 1, 1]},
|
||
|
dtype=np.uint8)
|
||
|
assert_frame_equal(res, exp)
|
||
|
|
||
|
def test_dataframe_dummies_all_obj(self, df, sparse):
|
||
|
df = df[['A', 'B']]
|
||
|
result = get_dummies(df, sparse=sparse)
|
||
|
expected = DataFrame({'A_a': [1, 0, 1],
|
||
|
'A_b': [0, 1, 0],
|
||
|
'B_b': [1, 1, 0],
|
||
|
'B_c': [0, 0, 1]},
|
||
|
dtype=np.uint8)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
|
||
|
result = get_dummies(df, sparse=sparse, dtype=dtype)
|
||
|
expected = DataFrame({'C': [1, 2, 3],
|
||
|
'A_a': [1, 0, 1],
|
||
|
'A_b': [0, 1, 0],
|
||
|
'B_b': [1, 1, 0],
|
||
|
'B_c': [0, 0, 1]})
|
||
|
cols = ['A_a', 'A_b', 'B_b', 'B_c']
|
||
|
expected[cols] = expected[cols].astype(dtype)
|
||
|
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_dataframe_dummies_prefix_list(self, df, sparse):
|
||
|
prefixes = ['from_A', 'from_B']
|
||
|
result = get_dummies(df, prefix=prefixes, sparse=sparse)
|
||
|
expected = DataFrame({'C': [1, 2, 3],
|
||
|
'from_A_a': [1, 0, 1],
|
||
|
'from_A_b': [0, 1, 0],
|
||
|
'from_B_b': [1, 1, 0],
|
||
|
'from_B_c': [0, 0, 1]},
|
||
|
dtype=np.uint8)
|
||
|
expected[['C']] = df[['C']]
|
||
|
expected = expected[['C', 'from_A_a', 'from_A_b',
|
||
|
'from_B_b', 'from_B_c']]
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_dataframe_dummies_prefix_str(self, df, sparse):
|
||
|
# not that you should do this...
|
||
|
result = get_dummies(df, prefix='bad', sparse=sparse)
|
||
|
bad_columns = ['bad_a', 'bad_b', 'bad_b', 'bad_c']
|
||
|
expected = DataFrame([[1, 1, 0, 1, 0],
|
||
|
[2, 0, 1, 1, 0],
|
||
|
[3, 1, 0, 0, 1]],
|
||
|
columns=['C'] + bad_columns,
|
||
|
dtype=np.uint8)
|
||
|
expected = expected.astype({"C": np.int64})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_dataframe_dummies_subset(self, df, sparse):
|
||
|
result = get_dummies(df, prefix=['from_A'], columns=['A'],
|
||
|
sparse=sparse)
|
||
|
expected = DataFrame({'B': ['b', 'b', 'c'],
|
||
|
'C': [1, 2, 3],
|
||
|
'from_A_a': [1, 0, 1],
|
||
|
'from_A_b': [0, 1, 0]}, dtype=np.uint8)
|
||
|
expected[['C']] = df[['C']]
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_dataframe_dummies_prefix_sep(self, df, sparse):
|
||
|
result = get_dummies(df, prefix_sep='..', sparse=sparse)
|
||
|
expected = DataFrame({'C': [1, 2, 3],
|
||
|
'A..a': [1, 0, 1],
|
||
|
'A..b': [0, 1, 0],
|
||
|
'B..b': [1, 1, 0],
|
||
|
'B..c': [0, 0, 1]},
|
||
|
dtype=np.uint8)
|
||
|
expected[['C']] = df[['C']]
|
||
|
expected = expected[['C', 'A..a', 'A..b', 'B..b', 'B..c']]
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = get_dummies(df, prefix_sep=['..', '__'], sparse=sparse)
|
||
|
expected = expected.rename(columns={'B..b': 'B__b', 'B..c': 'B__c'})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = get_dummies(df, prefix_sep={'A': '..', 'B': '__'},
|
||
|
sparse=sparse)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
|
||
|
with pytest.raises(ValueError):
|
||
|
get_dummies(df, prefix=['too few'], sparse=sparse)
|
||
|
|
||
|
def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
|
||
|
with pytest.raises(ValueError):
|
||
|
get_dummies(df, prefix_sep=['bad'], sparse=sparse)
|
||
|
|
||
|
def test_dataframe_dummies_prefix_dict(self, sparse):
|
||
|
prefixes = {'A': 'from_A', 'B': 'from_B'}
|
||
|
df = DataFrame({'C': [1, 2, 3],
|
||
|
'A': ['a', 'b', 'a'],
|
||
|
'B': ['b', 'b', 'c']})
|
||
|
result = get_dummies(df, prefix=prefixes, sparse=sparse)
|
||
|
|
||
|
expected = DataFrame({'C': [1, 2, 3],
|
||
|
'from_A_a': [1, 0, 1],
|
||
|
'from_A_b': [0, 1, 0],
|
||
|
'from_B_b': [1, 1, 0],
|
||
|
'from_B_c': [0, 0, 1]})
|
||
|
|
||
|
columns = ['from_A_a', 'from_A_b', 'from_B_b', 'from_B_c']
|
||
|
expected[columns] = expected[columns].astype(np.uint8)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_dataframe_dummies_with_na(self, df, sparse, dtype):
|
||
|
df.loc[3, :] = [np.nan, np.nan, np.nan]
|
||
|
result = get_dummies(df, dummy_na=True,
|
||
|
sparse=sparse, dtype=dtype).sort_index(axis=1)
|
||
|
expected = DataFrame({'C': [1, 2, 3, np.nan],
|
||
|
'A_a': [1, 0, 1, 0],
|
||
|
'A_b': [0, 1, 0, 0],
|
||
|
'A_nan': [0, 0, 0, 1],
|
||
|
'B_b': [1, 1, 0, 0],
|
||
|
'B_c': [0, 0, 1, 0],
|
||
|
'B_nan': [0, 0, 0, 1]}).sort_index(axis=1)
|
||
|
|
||
|
e_dtype = self.effective_dtype(dtype)
|
||
|
columns = ['A_a', 'A_b', 'A_nan', 'B_b', 'B_c', 'B_nan']
|
||
|
expected[columns] = expected[columns].astype(e_dtype)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
|
||
|
expected = expected[['C', 'A_a', 'A_b', 'B_b', 'B_c']]
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
|
||
|
df['cat'] = pd.Categorical(['x', 'y', 'y'])
|
||
|
result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
|
||
|
expected = DataFrame({'C': [1, 2, 3],
|
||
|
'A_a': [1, 0, 1],
|
||
|
'A_b': [0, 1, 0],
|
||
|
'B_b': [1, 1, 0],
|
||
|
'B_c': [0, 0, 1],
|
||
|
'cat_x': [1, 0, 0],
|
||
|
'cat_y': [0, 1, 1]}).sort_index(axis=1)
|
||
|
|
||
|
columns = ['A_a', 'A_b', 'B_b', 'B_c', 'cat_x', 'cat_y']
|
||
|
effective_dtype = self.effective_dtype(dtype)
|
||
|
expected[columns] = expected[columns].astype(effective_dtype)
|
||
|
expected.sort_index(axis=1)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_basic_drop_first(self, sparse):
|
||
|
# GH12402 Add a new parameter `drop_first` to avoid collinearity
|
||
|
# Basic case
|
||
|
s_list = list('abc')
|
||
|
s_series = Series(s_list)
|
||
|
s_series_index = Series(s_list, list('ABC'))
|
||
|
|
||
|
expected = DataFrame({'b': [0, 1, 0],
|
||
|
'c': [0, 0, 1]},
|
||
|
dtype=np.uint8)
|
||
|
|
||
|
result = get_dummies(s_list, drop_first=True, sparse=sparse)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = get_dummies(s_series, drop_first=True, sparse=sparse)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
expected.index = list('ABC')
|
||
|
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_basic_drop_first_one_level(self, sparse):
|
||
|
# Test the case that categorical variable only has one level.
|
||
|
s_list = list('aaa')
|
||
|
s_series = Series(s_list)
|
||
|
s_series_index = Series(s_list, list('ABC'))
|
||
|
|
||
|
expected = DataFrame(index=np.arange(3))
|
||
|
|
||
|
result = get_dummies(s_list, drop_first=True, sparse=sparse)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = get_dummies(s_series, drop_first=True, sparse=sparse)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
expected = DataFrame(index=list('ABC'))
|
||
|
result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_basic_drop_first_NA(self, sparse):
|
||
|
# Test NA handling together with drop_first
|
||
|
s_NA = ['a', 'b', np.nan]
|
||
|
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
|
||
|
exp = DataFrame({'b': [0, 1, 0]}, dtype=np.uint8)
|
||
|
assert_frame_equal(res, exp)
|
||
|
|
||
|
res_na = get_dummies(s_NA, dummy_na=True, drop_first=True,
|
||
|
sparse=sparse)
|
||
|
exp_na = DataFrame(
|
||
|
{'b': [0, 1, 0],
|
||
|
nan: [0, 0, 1]},
|
||
|
dtype=np.uint8).reindex(['b', nan], axis=1)
|
||
|
assert_frame_equal(res_na, exp_na)
|
||
|
|
||
|
res_just_na = get_dummies([nan], dummy_na=True, drop_first=True,
|
||
|
sparse=sparse)
|
||
|
exp_just_na = DataFrame(index=np.arange(1))
|
||
|
assert_frame_equal(res_just_na, exp_just_na)
|
||
|
|
||
|
def test_dataframe_dummies_drop_first(self, df, sparse):
|
||
|
df = df[['A', 'B']]
|
||
|
result = get_dummies(df, drop_first=True, sparse=sparse)
|
||
|
expected = DataFrame({'A_b': [0, 1, 0],
|
||
|
'B_c': [0, 0, 1]},
|
||
|
dtype=np.uint8)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_dataframe_dummies_drop_first_with_categorical(
|
||
|
self, df, sparse, dtype):
|
||
|
df['cat'] = pd.Categorical(['x', 'y', 'y'])
|
||
|
result = get_dummies(df, drop_first=True, sparse=sparse)
|
||
|
expected = DataFrame({'C': [1, 2, 3],
|
||
|
'A_b': [0, 1, 0],
|
||
|
'B_c': [0, 0, 1],
|
||
|
'cat_y': [0, 1, 1]})
|
||
|
cols = ['A_b', 'B_c', 'cat_y']
|
||
|
expected[cols] = expected[cols].astype(np.uint8)
|
||
|
expected = expected[['C', 'A_b', 'B_c', 'cat_y']]
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
|
||
|
df.loc[3, :] = [np.nan, np.nan, np.nan]
|
||
|
result = get_dummies(df, dummy_na=True, drop_first=True,
|
||
|
sparse=sparse).sort_index(axis=1)
|
||
|
expected = DataFrame({'C': [1, 2, 3, np.nan],
|
||
|
'A_b': [0, 1, 0, 0],
|
||
|
'A_nan': [0, 0, 0, 1],
|
||
|
'B_c': [0, 0, 1, 0],
|
||
|
'B_nan': [0, 0, 0, 1]})
|
||
|
cols = ['A_b', 'A_nan', 'B_c', 'B_nan']
|
||
|
expected[cols] = expected[cols].astype(np.uint8)
|
||
|
expected = expected.sort_index(axis=1)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = get_dummies(df, dummy_na=False, drop_first=True,
|
||
|
sparse=sparse)
|
||
|
expected = expected[['C', 'A_b', 'B_c']]
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_int_int(self):
|
||
|
data = Series([1, 2, 1])
|
||
|
result = pd.get_dummies(data)
|
||
|
expected = DataFrame([[1, 0],
|
||
|
[0, 1],
|
||
|
[1, 0]],
|
||
|
columns=[1, 2],
|
||
|
dtype=np.uint8)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
data = Series(pd.Categorical(['a', 'b', 'a']))
|
||
|
result = pd.get_dummies(data)
|
||
|
expected = DataFrame([[1, 0],
|
||
|
[0, 1],
|
||
|
[1, 0]],
|
||
|
columns=pd.Categorical(['a', 'b']),
|
||
|
dtype=np.uint8)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_int_df(self, dtype):
|
||
|
data = DataFrame(
|
||
|
{'A': [1, 2, 1],
|
||
|
'B': pd.Categorical(['a', 'b', 'a']),
|
||
|
'C': [1, 2, 1],
|
||
|
'D': [1., 2., 1.]
|
||
|
}
|
||
|
)
|
||
|
columns = ['C', 'D', 'A_1', 'A_2', 'B_a', 'B_b']
|
||
|
expected = DataFrame([
|
||
|
[1, 1., 1, 0, 1, 0],
|
||
|
[2, 2., 0, 1, 0, 1],
|
||
|
[1, 1., 1, 0, 1, 0]
|
||
|
], columns=columns)
|
||
|
expected[columns[2:]] = expected[columns[2:]].astype(dtype)
|
||
|
result = pd.get_dummies(data, columns=['A', 'B'], dtype=dtype)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):
|
||
|
# GH13854
|
||
|
for ordered in [False, True]:
|
||
|
cat = pd.Categorical(list("xy"), categories=list("xyz"),
|
||
|
ordered=ordered)
|
||
|
result = get_dummies(cat, dtype=dtype)
|
||
|
|
||
|
data = np.array([[1, 0, 0], [0, 1, 0]],
|
||
|
dtype=self.effective_dtype(dtype))
|
||
|
cols = pd.CategoricalIndex(cat.categories,
|
||
|
categories=cat.categories,
|
||
|
ordered=ordered)
|
||
|
expected = DataFrame(data, columns=cols,
|
||
|
dtype=self.effective_dtype(dtype))
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
@pytest.mark.parametrize('sparse', [True, False])
|
||
|
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
|
||
|
# GH18914
|
||
|
df = DataFrame.from_dict(OrderedDict([('GDP', [1, 2]),
|
||
|
('Nation', ['AB', 'CD'])]))
|
||
|
df = get_dummies(df, columns=['Nation'], sparse=sparse)
|
||
|
df2 = df.reindex(columns=['GDP'])
|
||
|
|
||
|
tm.assert_frame_equal(df[['GDP']], df2)
|
||
|
|
||
|
def test_get_dummies_duplicate_columns(self, df):
|
||
|
# GH20839
|
||
|
df.columns = ["A", "A", "A"]
|
||
|
result = get_dummies(df).sort_index(axis=1)
|
||
|
|
||
|
expected = DataFrame([[1, 1, 0, 1, 0],
|
||
|
[2, 0, 1, 1, 0],
|
||
|
[3, 1, 0, 0, 1]],
|
||
|
columns=['A', 'A_a', 'A_b', 'A_b', 'A_c'],
|
||
|
dtype=np.uint8).sort_index(axis=1)
|
||
|
|
||
|
expected = expected.astype({"A": np.int64})
|
||
|
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
class TestCategoricalReshape(object):
|
||
|
|
||
|
def test_reshaping_panel_categorical(self):
|
||
|
|
||
|
with catch_warnings(record=True):
|
||
|
p = tm.makePanel()
|
||
|
p['str'] = 'foo'
|
||
|
df = p.to_frame()
|
||
|
|
||
|
df['category'] = df['str'].astype('category')
|
||
|
result = df['category'].unstack()
|
||
|
|
||
|
c = Categorical(['foo'] * len(p.major_axis))
|
||
|
expected = DataFrame({'A': c.copy(),
|
||
|
'B': c.copy(),
|
||
|
'C': c.copy(),
|
||
|
'D': c.copy()},
|
||
|
columns=Index(list('ABCD'), name='minor'),
|
||
|
index=p.major_axis.set_names('major'))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
|
||
|
class TestMakeAxisDummies(object):
|
||
|
|
||
|
def test_preserve_categorical_dtype(self):
|
||
|
# GH13854
|
||
|
for ordered in [False, True]:
|
||
|
cidx = pd.CategoricalIndex(list("xyz"), ordered=ordered)
|
||
|
midx = pd.MultiIndex(levels=[['a'], cidx],
|
||
|
labels=[[0, 0], [0, 1]])
|
||
|
df = DataFrame([[10, 11]], index=midx)
|
||
|
|
||
|
expected = DataFrame([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
|
||
|
index=midx, columns=cidx)
|
||
|
|
||
|
from pandas.core.reshape.reshape import make_axis_dummies
|
||
|
result = make_axis_dummies(df)
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
result = make_axis_dummies(df, transform=lambda x: x)
|
||
|
tm.assert_frame_equal(result, expected)
|