# -*- coding: utf-8 -*-
"""Tests for the category-manipulation API of ``Categorical``.

Covers ordering flags, renaming / reordering / adding / setting / removing
categories, ``describe`` and the private code-recoding helper.
"""

import pytest

import numpy as np

import pandas.util.testing as tm
from pandas import Categorical, CategoricalIndex, Index, Series, DataFrame

from pandas.core.arrays.categorical import _recode_for_categories
from pandas.tests.categorical.common import TestCategorical


class TestCategoricalAPI(object):
    """Tests for the public category API on stand-alone Categoricals."""

    def test_ordered_api(self):
        """``ordered`` and ``categories`` passed at construction are kept."""
        # GH 9347
        cat1 = Categorical(list('acb'), ordered=False)
        tm.assert_index_equal(cat1.categories, Index(['a', 'b', 'c']))
        assert not cat1.ordered

        cat2 = Categorical(list('acb'), categories=list('bca'),
                           ordered=False)
        tm.assert_index_equal(cat2.categories, Index(['b', 'c', 'a']))
        assert not cat2.ordered

        cat3 = Categorical(list('acb'), ordered=True)
        tm.assert_index_equal(cat3.categories, Index(['a', 'b', 'c']))
        assert cat3.ordered

        cat4 = Categorical(list('acb'), categories=list('bca'),
                           ordered=True)
        tm.assert_index_equal(cat4.categories, Index(['b', 'c', 'a']))
        assert cat4.ordered

    def test_set_ordered(self):
        """Check as_ordered / as_unordered / set_ordered and the read-only
        ``ordered`` attribute."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        cat2 = cat.as_unordered()
        assert not cat2.ordered
        cat2 = cat.as_ordered()
        assert cat2.ordered

        # in-place variants mutate cat2 itself
        cat2.as_unordered(inplace=True)
        assert not cat2.ordered
        cat2.as_ordered(inplace=True)
        assert cat2.ordered

        assert cat2.set_ordered(True).ordered
        assert not cat2.set_ordered(False).ordered
        cat2.set_ordered(True, inplace=True)
        assert cat2.ordered
        cat2.set_ordered(False, inplace=True)
        assert not cat2.ordered

        # removed in 0.19.0
        msg = "can't set attribute"
        with tm.assert_raises_regex(AttributeError, msg):
            cat.ordered = True
        with tm.assert_raises_regex(AttributeError, msg):
            cat.ordered = False

    def test_rename_categories(self):
        """rename_categories with a list, a callable, in place, and with a
        wrong number of new categories."""
        cat = Categorical(["a", "b", "c", "a"])

        # inplace=False: the old one must not be changed
        res = cat.rename_categories([1, 2, 3])
        exp_renamed = np.array([1, 2, 3, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(res.__array__(), exp_renamed)
        tm.assert_index_equal(res.categories, Index([1, 2, 3]))

        exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
        tm.assert_numpy_array_equal(cat.__array__(), exp_cat)

        exp_cat = Index(["a", "b", "c"])
        tm.assert_index_equal(cat.categories, exp_cat)

        # GH18862 (let rename_categories take callables)
        result = cat.rename_categories(lambda x: x.upper())
        expected = Categorical(["A", "B", "C", "A"])
        tm.assert_categorical_equal(result, expected)

        # and now inplace
        res = cat.rename_categories([1, 2, 3], inplace=True)
        assert res is None
        tm.assert_numpy_array_equal(cat.__array__(), exp_renamed)
        tm.assert_index_equal(cat.categories, Index([1, 2, 3]))

        # Lengthen
        with pytest.raises(ValueError):
            cat.rename_categories([1, 2, 3, 4])

        # Shorten
        with pytest.raises(ValueError):
            cat.rename_categories([1, 2])

    def test_rename_categories_series(self):
        """Passing a Series to rename_categories emits one FutureWarning."""
        # https://github.com/pandas-dev/pandas/issues/17981
        c = Categorical(['a', 'b'])
        xpr = "Treating Series 'new_categories' as a list-like "
        with tm.assert_produces_warning(FutureWarning) as rec:
            result = c.rename_categories(Series([0, 1]))

        assert len(rec) == 1
        assert xpr in str(rec[0].message)
        expected = Categorical([0, 1])
        tm.assert_categorical_equal(result, expected)

    def test_rename_categories_dict(self):
        """rename_categories with dict mappings of various sizes."""
        # GH 17336
        cat = Categorical(['a', 'b', 'c', 'd'])
        res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1})
        expected = Index([4, 3, 2, 1])
        tm.assert_index_equal(res.categories, expected)

        # Test for inplace
        res = cat.rename_categories({'a': 4, 'b': 3, 'c': 2, 'd': 1},
                                    inplace=True)
        assert res is None
        tm.assert_index_equal(cat.categories, expected)

        # Test for dicts of smaller length
        cat = Categorical(['a', 'b', 'c', 'd'])
        res = cat.rename_categories({'a': 1, 'c': 3})

        expected = Index([1, 'b', 3, 'd'])
        tm.assert_index_equal(res.categories, expected)

        # Test for dicts with bigger length
        cat = Categorical(['a', 'b', 'c', 'd'])
        res = cat.rename_categories({'a': 1, 'b': 2, 'c': 3,
                                     'd': 4, 'e': 5, 'f': 6})
        expected = Index([1, 2, 3, 4])
        tm.assert_index_equal(res.categories, expected)

        # Test for dicts with no items from old categories
        cat = Categorical(['a', 'b', 'c', 'd'])
        res = cat.rename_categories({'f': 1, 'g': 3})

        expected = Index(['a', 'b', 'c', 'd'])
        tm.assert_index_equal(res.categories, expected)

    def test_reorder_categories(self):
        """reorder_categories (copy and in place) and its ValueError when
        the new categories are not a permutation of the old ones."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        old = cat.copy()
        new = Categorical(["a", "b", "c", "a"], categories=["c", "b", "a"],
                          ordered=True)

        # first inplace == False
        res = cat.reorder_categories(["c", "b", "a"])
        # cat must be the same as before
        tm.assert_categorical_equal(cat, old)
        # only res is changed
        tm.assert_categorical_equal(res, new)

        # inplace == True
        res = cat.reorder_categories(["c", "b", "a"], inplace=True)
        assert res is None
        tm.assert_categorical_equal(cat, new)

        # not all "old" included in "new"
        cat = Categorical(["a", "b", "c", "a"], ordered=True)

        with pytest.raises(ValueError):
            cat.reorder_categories(["a"])

        # still not all "old" in "new"
        with pytest.raises(ValueError):
            cat.reorder_categories(["a", "b", "d"])

        # all "old" included in "new", but too long
        with pytest.raises(ValueError):
            cat.reorder_categories(["a", "b", "c", "d"])

    def test_add_categories(self):
        """add_categories with scalars and several list-like inputs, and
        its ValueError when the category already exists."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        old = cat.copy()
        new = Categorical(["a", "b", "c", "a"],
                          categories=["a", "b", "c", "d"], ordered=True)

        # first inplace == False
        res = cat.add_categories("d")
        tm.assert_categorical_equal(cat, old)
        tm.assert_categorical_equal(res, new)

        res = cat.add_categories(["d"])
        tm.assert_categorical_equal(cat, old)
        tm.assert_categorical_equal(res, new)

        # inplace == True
        res = cat.add_categories("d", inplace=True)
        tm.assert_categorical_equal(cat, new)
        assert res is None

        # new is in old categories
        with pytest.raises(ValueError):
            cat.add_categories(["d"])

        # GH 9927
        cat = Categorical(list("abc"), ordered=True)
        expected = Categorical(
            list("abc"), categories=list("abcde"), ordered=True)
        # test with Series, np.array, index, list
        res = cat.add_categories(Series(["d", "e"]))
        tm.assert_categorical_equal(res, expected)
        res = cat.add_categories(np.array(["d", "e"]))
        tm.assert_categorical_equal(res, expected)
        res = cat.add_categories(Index(["d", "e"]))
        tm.assert_categorical_equal(res, expected)
        res = cat.add_categories(["d", "e"])
        tm.assert_categorical_equal(res, expected)

    def test_set_categories(self):
        """set_categories: values are preserved, codes are remapped, and
        values whose category is dropped get code -1."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        exp_categories = Index(["c", "b", "a"])
        exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)

        res = cat.set_categories(["c", "b", "a"], inplace=True)
        tm.assert_index_equal(cat.categories, exp_categories)
        tm.assert_numpy_array_equal(cat.__array__(), exp_values)
        assert res is None

        res = cat.set_categories(["a", "b", "c"])
        # cat must be the same as before
        tm.assert_index_equal(cat.categories, exp_categories)
        tm.assert_numpy_array_equal(cat.__array__(), exp_values)
        # only res is changed
        exp_categories_back = Index(["a", "b", "c"])
        tm.assert_index_equal(res.categories, exp_categories_back)
        tm.assert_numpy_array_equal(res.__array__(), exp_values)

        # not all "old" included in "new" -> all not included ones are now
        # np.nan
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        res = cat.set_categories(["a"])
        tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0],
                                                        dtype=np.int8))

        # still not all "old" in "new"
        res = cat.set_categories(["a", "b", "d"])
        tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0],
                                                        dtype=np.int8))
        tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))

        # all "old" included in "new"
        cat = cat.set_categories(["a", "b", "c", "d"])
        exp_categories = Index(["a", "b", "c", "d"])
        tm.assert_index_equal(cat.categories, exp_categories)

        # internals...
        c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4],
                        ordered=True)
        tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0],
                                                       dtype=np.int8))
        tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))

        exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(c.get_values(), exp)

        # all "pointers" to '4' must be changed from 3 to 0,...
        c = c.set_categories([4, 3, 2, 1])

        # positions are changed
        tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3],
                                                       dtype=np.int8))

        # categories are now in new order
        tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))

        # output is the same
        exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
        tm.assert_numpy_array_equal(c.get_values(), exp)
        # min/max follow the (reversed) category order, not numeric order
        assert c.min() == 4
        assert c.max() == 1

        # set_categories should set the ordering if specified
        c2 = c.set_categories([4, 3, 2, 1], ordered=False)
        assert not c2.ordered

        tm.assert_numpy_array_equal(c.get_values(), c2.get_values())

        # set_categories should pass thru the ordering
        c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
        assert not c2.ordered

        tm.assert_numpy_array_equal(c.get_values(), c2.get_values())

    @pytest.mark.parametrize('values, categories, new_categories', [
        # No NaNs, same cats, same order
        (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
        # No NaNs, same cats, different order
        (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
        # Same, unsorted
        (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
        # No NaNs, same cats, different order
        (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
        # NaNs
        (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
        (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
        (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
        (['b', 'a', 'c'], ['a', 'b'], ['b', 'a']),
        # Introduce NaNs
        (['a', 'b', 'c'], ['a', 'b'], ['a']),
        (['a', 'b', 'c'], ['a', 'b'], ['b']),
        (['b', 'a', 'c'], ['a', 'b'], ['a']),
        (['b', 'a', 'c'], ['a', 'b'], ['b']),
        # No overlap
        (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
    ])
    @pytest.mark.parametrize('ordered', [True, False])
    def test_set_categories_many(self, values, categories, new_categories,
                                 ordered):
        """set_categories must match constructing a Categorical directly
        with the new categories and ordering."""
        c = Categorical(values, categories)
        expected = Categorical(values, new_categories, ordered)
        result = c.set_categories(new_categories, ordered=ordered)
        tm.assert_categorical_equal(result, expected)

    def test_set_categories_private(self):
        """_set_categories swaps the categories in place, with and without
        ``fastpath``; the expected values show the codes are left as-is."""
        cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
        cat._set_categories(['a', 'c', 'd', 'e'])
        expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
        tm.assert_categorical_equal(cat, expected)

        # fastpath
        cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd'])
        cat._set_categories(['a', 'c', 'd', 'e'], fastpath=True)
        expected = Categorical(['a', 'c', 'd'], categories=list('acde'))
        tm.assert_categorical_equal(cat, expected)

    def test_remove_categories(self):
        """remove_categories turns affected values into NaN and raises
        ValueError for a category that is not present."""
        cat = Categorical(["a", "b", "c", "a"], ordered=True)
        old = cat.copy()
        new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"],
                          ordered=True)

        # first inplace == False
        res = cat.remove_categories("c")
        tm.assert_categorical_equal(cat, old)
        tm.assert_categorical_equal(res, new)

        res = cat.remove_categories(["c"])
        tm.assert_categorical_equal(cat, old)
        tm.assert_categorical_equal(res, new)

        # inplace == True
        res = cat.remove_categories("c", inplace=True)
        tm.assert_categorical_equal(cat, new)
        assert res is None

        # removal is not in categories
        with pytest.raises(ValueError):
            cat.remove_categories(["c"])

    def test_remove_unused_categories(self):
        """remove_unused_categories drops categories with no values, keeps
        NaN as code -1 and leaves the visible values unchanged."""
        c = Categorical(["a", "b", "c", "d", "a"],
                        categories=["a", "b", "c", "d", "e"])
        exp_categories_all = Index(["a", "b", "c", "d", "e"])
        exp_categories_dropped = Index(["a", "b", "c", "d"])

        tm.assert_index_equal(c.categories, exp_categories_all)

        res = c.remove_unused_categories()
        tm.assert_index_equal(res.categories, exp_categories_dropped)
        tm.assert_index_equal(c.categories, exp_categories_all)

        res = c.remove_unused_categories(inplace=True)
        tm.assert_index_equal(c.categories, exp_categories_dropped)
        assert res is None

        # with NaN values (GH11599)
        c = Categorical(["a", "b", "c", np.nan],
                        categories=["a", "b", "c", "d", "e"])
        res = c.remove_unused_categories()
        tm.assert_index_equal(res.categories,
                              Index(np.array(["a", "b", "c"])))
        exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
        tm.assert_numpy_array_equal(res.codes, exp_codes)
        tm.assert_index_equal(c.categories, exp_categories_all)

        val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan]
        cat = Categorical(values=val, categories=list('ABCDEFG'))
        out = cat.remove_unused_categories()
        tm.assert_index_equal(out.categories, Index(['B', 'D', 'F']))
        exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
        tm.assert_numpy_array_equal(out.codes, exp_codes)
        assert out.get_values().tolist() == val

        # larger random input: only every other letter is used as a value
        alpha = list('abcdefghijklmnopqrstuvwxyz')
        val = np.random.choice(alpha[::2], 10000).astype('object')
        val[np.random.choice(len(val), 100)] = np.nan

        cat = Categorical(values=val, categories=alpha)
        out = cat.remove_unused_categories()
        assert out.get_values().tolist() == val.tolist()


class TestCategoricalAPIWithFactor(TestCategorical):
    """Tests that use ``self.factor``.

    ``self.factor`` is not defined in this file; presumably it is set up by
    the imported ``TestCategorical`` base class.
    """

    def test_describe(self):
        """describe() reports counts and freqs per category, including
        unused categories and NaN."""
        # string type
        desc = self.factor.describe()
        assert self.factor.ordered
        exp_index = CategoricalIndex(['a', 'b', 'c'], name='categories',
                                     ordered=self.factor.ordered)
        expected = DataFrame({'counts': [3, 2, 3],
                              'freqs': [3 / 8., 2 / 8., 3 / 8.]},
                             index=exp_index)
        tm.assert_frame_equal(desc, expected)

        # check unused categories
        cat = self.factor.copy()
        cat.set_categories(["a", "b", "c", "d"], inplace=True)
        desc = cat.describe()

        exp_index = CategoricalIndex(
            list('abcd'), ordered=self.factor.ordered, name='categories')
        expected = DataFrame({'counts': [3, 2, 3, 0],
                              'freqs': [3 / 8., 2 / 8., 3 / 8., 0]},
                             index=exp_index)
        tm.assert_frame_equal(desc, expected)

        # check an integer one
        cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
        desc = cat.describe()
        exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered,
                                     name='categories')
        expected = DataFrame({'counts': [5, 3, 3],
                              'freqs': [5 / 11., 3 / 11., 3 / 11.]},
                             index=exp_index)
        tm.assert_frame_equal(desc, expected)

        # https://github.com/pandas-dev/pandas/issues/3678
        # describe should work with NaN
        cat = Categorical([np.nan, 1, 2, 2])
        desc = cat.describe()
        expected = DataFrame({'counts': [1, 2, 1],
                              'freqs': [1 / 4., 2 / 4., 1 / 4.]},
                             index=CategoricalIndex([1, 2, np.nan],
                                                    categories=[1, 2],
                                                    name='categories'))
        tm.assert_frame_equal(desc, expected)

    def test_set_categories_inplace(self):
        """In-place set_categories on a copy of the factor updates its
        categories."""
        cat = self.factor.copy()
        cat.set_categories(['a', 'b', 'c', 'd'], inplace=True)
        tm.assert_index_equal(cat.categories, Index(['a', 'b', 'c', 'd']))


class TestPrivateCategoricalAPI(object):
    """Tests for Categorical internals (codes and recoding helper)."""

    def test_codes_immutable(self):
        """``codes`` cannot be assigned or written through, while the
        Categorical itself stays writeable."""
        # Codes should be read only
        c = Categorical(["a", "b", "c", "a", np.nan])
        exp = np.array([0, 1, 2, 0, -1], dtype='int8')
        tm.assert_numpy_array_equal(c.codes, exp)

        # Assignments to codes should raise
        with pytest.raises(ValueError):
            c.codes = np.array([0, 1, 2, 0, 1], dtype='int8')

        # changes in the codes array should raise
        # np 1.6.1 raises RuntimeError rather than ValueError
        codes = c.codes

        with pytest.raises(ValueError):
            codes[4] = 1

        # But even after getting the codes, the original array should still
        # be writeable!
        c[4] = "a"
        exp = np.array([0, 1, 2, 0, 0], dtype='int8')
        tm.assert_numpy_array_equal(c.codes, exp)
        c._codes[4] = 2
        exp = np.array([0, 1, 2, 0, 2], dtype='int8')
        tm.assert_numpy_array_equal(c.codes, exp)

    @pytest.mark.parametrize('codes, old, new, expected', [
        ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]),
        ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]),
        ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]),
        ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]),
        ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]),
        ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]),
        ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]),
        ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]),
        ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]),
        ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]),
        ([-1, -1], [], ['a', 'b'], [-1, -1]),
        ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]),
    ])
    def test_recode_to_categories(self, codes, old, new, expected):
        """_recode_for_categories maps int8 codes from the ``old``
        categories onto the ``new`` ones."""
        codes = np.asanyarray(codes, dtype=np.int8)
        expected = np.asanyarray(expected, dtype=np.int8)
        old = Index(old)
        new = Index(new)
        result = _recode_for_categories(codes, old, new)
        tm.assert_numpy_array_equal(result, expected)

    def test_recode_to_categories_large(self):
        """Recoding 1000 categories into reversed order; the expected
        result is built as int16."""
        N = 1000
        codes = np.arange(N)
        old = Index(codes)
        expected = np.arange(N - 1, -1, -1, dtype=np.int16)
        new = Index(expected)
        result = _recode_for_categories(codes, old, new)
        tm.assert_numpy_array_equal(result, expected)