# pylint: disable-msg=E1101,W0612 import operator import pytest from warnings import catch_warnings from numpy import nan import numpy as np import pandas as pd from pandas import Series, DataFrame, bdate_range, Panel from pandas.core.indexes.datetimes import DatetimeIndex from pandas.tseries.offsets import BDay from pandas.util import testing as tm from pandas.compat import lrange from pandas import compat from pandas.core.sparse import frame as spf from pandas._libs.sparse import BlockIndex, IntIndex from pandas.core.sparse.api import SparseSeries, SparseDataFrame, SparseArray from pandas.tests.frame.test_api import SharedWithSparse class TestSparseDataFrame(SharedWithSparse): klass = SparseDataFrame # SharedWithSparse tests use generic, klass-agnostic assertion _assert_frame_equal = staticmethod(tm.assert_sp_frame_equal) _assert_series_equal = staticmethod(tm.assert_sp_series_equal) def setup_method(self, method): self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], 'C': np.arange(10, dtype=np.float64), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} self.dates = bdate_range('1/1/2011', periods=10) self.orig = pd.DataFrame(self.data, index=self.dates) self.iorig = pd.DataFrame(self.data, index=self.dates) self.frame = SparseDataFrame(self.data, index=self.dates) self.iframe = SparseDataFrame(self.data, index=self.dates, default_kind='integer') self.mixed_frame = self.frame.copy(False) self.mixed_frame['foo'] = pd.SparseArray(['bar'] * len(self.dates)) values = self.frame.values.copy() values[np.isnan(values)] = 0 self.zorig = pd.DataFrame(values, columns=['A', 'B', 'C', 'D'], index=self.dates) self.zframe = SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], default_fill_value=0, index=self.dates) values = self.frame.values.copy() values[np.isnan(values)] = 2 self.fill_orig = pd.DataFrame(values, columns=['A', 'B', 'C', 'D'], index=self.dates) self.fill_frame = SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], default_fill_value=2, index=self.dates) self.empty = SparseDataFrame() def test_fill_value_when_combine_const(self): # GH12723 dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float') df = SparseDataFrame({'foo': dat}, index=range(6)) exp = df.fillna(0).add(2) res = df.add(2, fill_value=0) tm.assert_sp_frame_equal(res, exp) def test_values(self): empty = self.empty.values assert empty.shape == (0, 0) no_cols = SparseDataFrame(index=np.arange(10)) mat = no_cols.values assert mat.shape == (10, 0) no_index = SparseDataFrame(columns=np.arange(10)) mat = no_index.values assert mat.shape == (0, 10) def test_copy(self): cp = self.frame.copy() assert isinstance(cp, SparseDataFrame) tm.assert_sp_frame_equal(cp, self.frame) # as of v0.15.0 # this is now identical (but not is_a ) assert cp.index.identical(self.frame.index) def test_constructor(self): for col, series in compat.iteritems(self.frame): assert isinstance(series, SparseSeries) assert isinstance(self.iframe['A'].sp_index, IntIndex) # constructed zframe from matrix above assert self.zframe['A'].fill_value == 0 tm.assert_numpy_array_equal(pd.SparseArray([1., 2., 3., 4., 5., 6.]), self.zframe['A'].values) tm.assert_numpy_array_equal(np.array([0., 0., 0., 0., 1., 2., 3., 4., 5., 6.]), self.zframe['A'].to_dense().values) # construct no data sdf = SparseDataFrame(columns=np.arange(10), index=np.arange(10)) for col, series in compat.iteritems(sdf): assert isinstance(series, SparseSeries) # construct from nested dict data = {} for c, s in compat.iteritems(self.frame): data[c] = s.to_dict() sdf = SparseDataFrame(data) tm.assert_sp_frame_equal(sdf, self.frame) # TODO: test data is copied from inputs # init dict with different index idx = self.frame.index[:5] cons = SparseDataFrame( self.frame, index=idx, columns=self.frame.columns, default_fill_value=self.frame.default_fill_value, default_kind=self.frame.default_kind, copy=True) reindexed = self.frame.reindex(idx) tm.assert_sp_frame_equal(cons, reindexed, exact_indices=False) # assert level parameter breaks reindex with pytest.raises(TypeError): self.frame.reindex(idx, level=0) repr(self.frame) def test_constructor_dict_order(self): # GH19018 # initialization ordering: by insertion order if python>= 3.6, else # order by value d = {'b': [2, 3], 'a': [0, 1]} frame = SparseDataFrame(data=d) if compat.PY36: expected = SparseDataFrame(data=d, columns=list('ba')) else: expected = SparseDataFrame(data=d, columns=list('ab')) tm.assert_sp_frame_equal(frame, expected) def test_constructor_ndarray(self): # no index or columns sp = SparseDataFrame(self.frame.values) # 1d sp = SparseDataFrame(self.data['A'], index=self.dates, columns=['A']) tm.assert_sp_frame_equal(sp, self.frame.reindex(columns=['A'])) # raise on level argument pytest.raises(TypeError, self.frame.reindex, columns=['A'], level=1) # wrong length index / columns with tm.assert_raises_regex(ValueError, "^Index length"): SparseDataFrame(self.frame.values, index=self.frame.index[:-1]) with tm.assert_raises_regex(ValueError, "^Column length"): SparseDataFrame(self.frame.values, columns=self.frame.columns[:-1]) # GH 9272 def test_constructor_empty(self): sp = SparseDataFrame() assert len(sp.index) == 0 assert len(sp.columns) == 0 def test_constructor_dataframe(self): dense = self.frame.to_dense() sp = SparseDataFrame(dense) tm.assert_sp_frame_equal(sp, self.frame) def test_constructor_convert_index_once(self): arr = np.array([1.5, 2.5, 3.5]) sdf = SparseDataFrame(columns=lrange(4), index=arr) assert sdf[0].index is sdf[1].index def test_constructor_from_series(self): # GH 2873 x = Series(np.random.randn(10000), name='a') x = x.to_sparse(fill_value=0) assert isinstance(x, SparseSeries) df = SparseDataFrame(x) assert isinstance(df, SparseDataFrame) x = Series(np.random.randn(10000), name='a') y = Series(np.random.randn(10000), name='b') x2 = x.astype(float) x2.loc[:9998] = np.NaN # TODO: x_sparse is unused...fix x_sparse = x2.to_sparse(fill_value=np.NaN) # noqa # Currently fails too with weird ufunc error # df1 = SparseDataFrame([x_sparse, y]) y.loc[:9998] = 0 # TODO: y_sparse is unsused...fix y_sparse = y.to_sparse(fill_value=0) # noqa # without sparse value raises error # df2 = SparseDataFrame([x2_sparse, y]) def test_constructor_from_dense_series(self): # GH 19393 # series with name x = Series(np.random.randn(10000), name='a') result = SparseDataFrame(x) expected = x.to_frame().to_sparse() tm.assert_sp_frame_equal(result, expected) # series with no name x = Series(np.random.randn(10000)) result = SparseDataFrame(x) expected = x.to_frame().to_sparse() tm.assert_sp_frame_equal(result, expected) def test_constructor_from_unknown_type(self): # GH 19393 class Unknown(object): pass with pytest.raises(TypeError, message='SparseDataFrame called with unknown type ' '"Unknown" for data argument'): SparseDataFrame(Unknown()) def test_constructor_preserve_attr(self): # GH 13866 arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0) assert arr.dtype == np.int64 assert arr.fill_value == 0 df = pd.SparseDataFrame({'x': arr}) assert df['x'].dtype == np.int64 assert df['x'].fill_value == 0 s = pd.SparseSeries(arr, name='x') assert s.dtype == np.int64 assert s.fill_value == 0 df = pd.SparseDataFrame(s) assert df['x'].dtype == np.int64 assert df['x'].fill_value == 0 df = pd.SparseDataFrame({'x': s}) assert df['x'].dtype == np.int64 assert df['x'].fill_value == 0 def test_constructor_nan_dataframe(self): # GH 10079 trains = np.arange(100) thresholds = [10, 20, 30, 40, 50, 60] tuples = [(i, j) for i in trains for j in thresholds] index = pd.MultiIndex.from_tuples(tuples, names=['trains', 'thresholds']) matrix = np.empty((len(index), len(trains))) matrix.fill(np.nan) df = pd.DataFrame(matrix, index=index, columns=trains, dtype=float) result = df.to_sparse() expected = pd.SparseDataFrame(matrix, index=index, columns=trains, dtype=float) tm.assert_sp_frame_equal(result, expected) def test_type_coercion_at_construction(self): # GH 15682 result = pd.SparseDataFrame( {'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]}, dtype='uint8', default_fill_value=0) expected = pd.SparseDataFrame( {'a': pd.SparseSeries([1, 0, 0], dtype='uint8'), 'b': pd.SparseSeries([0, 1, 0], dtype='uint8'), 'c': pd.SparseSeries([0, 0, 1], dtype='uint8')}, default_fill_value=0) tm.assert_sp_frame_equal(result, expected) def test_dtypes(self): df = DataFrame(np.random.randn(10000, 4)) df.loc[:9998] = np.nan sdf = df.to_sparse() result = sdf.get_dtype_counts() expected = Series({'float64': 4}) tm.assert_series_equal(result, expected) def test_shape(self): # see gh-10452 assert self.frame.shape == (10, 4) assert self.iframe.shape == (10, 4) assert self.zframe.shape == (10, 4) assert self.fill_frame.shape == (10, 4) def test_str(self): df = DataFrame(np.random.randn(10000, 4)) df.loc[:9998] = np.nan sdf = df.to_sparse() str(sdf) def test_array_interface(self): res = np.sqrt(self.frame) dres = np.sqrt(self.frame.to_dense()) tm.assert_frame_equal(res.to_dense(), dres) def test_pickle(self): def _test_roundtrip(frame, orig): result = tm.round_trip_pickle(frame) tm.assert_sp_frame_equal(frame, result) tm.assert_frame_equal(result.to_dense(), orig, check_dtype=False) _test_roundtrip(SparseDataFrame(), DataFrame()) self._check_all(_test_roundtrip) def test_dense_to_sparse(self): df = DataFrame({'A': [nan, nan, nan, 1, 2], 'B': [1, 2, nan, nan, nan]}) sdf = df.to_sparse() assert isinstance(sdf, SparseDataFrame) assert np.isnan(sdf.default_fill_value) assert isinstance(sdf['A'].sp_index, BlockIndex) tm.assert_frame_equal(sdf.to_dense(), df) sdf = df.to_sparse(kind='integer') assert isinstance(sdf['A'].sp_index, IntIndex) df = DataFrame({'A': [0, 0, 0, 1, 2], 'B': [1, 2, 0, 0, 0]}, dtype=float) sdf = df.to_sparse(fill_value=0) assert sdf.default_fill_value == 0 tm.assert_frame_equal(sdf.to_dense(), df) def test_density(self): df = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6]) assert df.density == 0.7 df = SparseDataFrame({'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], 'C': np.arange(10), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}) assert df.density == 0.75 def test_sparse_to_dense(self): pass def test_sparse_series_ops(self): self._check_frame_ops(self.frame) def test_sparse_series_ops_i(self): self._check_frame_ops(self.iframe) def test_sparse_series_ops_z(self): self._check_frame_ops(self.zframe) def test_sparse_series_ops_fill(self): self._check_frame_ops(self.fill_frame) def _check_frame_ops(self, frame): def _compare_to_dense(a, b, da, db, op): sparse_result = op(a, b) dense_result = op(da, db) fill = sparse_result.default_fill_value dense_result = dense_result.to_sparse(fill_value=fill) tm.assert_sp_frame_equal(sparse_result, dense_result, exact_indices=False) if isinstance(a, DataFrame) and isinstance(db, DataFrame): mixed_result = op(a, db) assert isinstance(mixed_result, SparseDataFrame) tm.assert_sp_frame_equal(mixed_result, sparse_result, exact_indices=False) opnames = ['add', 'sub', 'mul', 'truediv', 'floordiv'] ops = [getattr(operator, name) for name in opnames] fidx = frame.index # time series operations series = [frame['A'], frame['B'], frame['C'], frame['D'], frame['A'].reindex(fidx[:7]), frame['A'].reindex(fidx[::2]), SparseSeries( [], index=[])] for op in opnames: _compare_to_dense(frame, frame[::2], frame.to_dense(), frame[::2].to_dense(), getattr(operator, op)) # 2304, no auto-broadcasting for i, s in enumerate(series): f = lambda a, b: getattr(a, op)(b, axis='index') _compare_to_dense(frame, s, frame.to_dense(), s.to_dense(), f) # rops are not implemented # _compare_to_dense(s, frame, s.to_dense(), # frame.to_dense(), f) # cross-sectional operations series = [frame.xs(fidx[0]), frame.xs(fidx[3]), frame.xs(fidx[5]), frame.xs(fidx[7]), frame.xs(fidx[5])[:2]] for op in ops: for s in series: _compare_to_dense(frame, s, frame.to_dense(), s, op) _compare_to_dense(s, frame, s, frame.to_dense(), op) # it works! result = self.frame + self.frame.loc[:, ['A', 'B']] # noqa def test_op_corners(self): empty = self.empty + self.empty assert empty.empty foo = self.frame + self.empty assert isinstance(foo.index, DatetimeIndex) tm.assert_frame_equal(foo, self.frame * np.nan) foo = self.empty + self.frame tm.assert_frame_equal(foo, self.frame * np.nan) def test_scalar_ops(self): pass def test_getitem(self): # 1585 select multiple columns sdf = SparseDataFrame(index=[0, 1, 2], columns=['a', 'b', 'c']) result = sdf[['a', 'b']] exp = sdf.reindex(columns=['a', 'b']) tm.assert_sp_frame_equal(result, exp) pytest.raises(Exception, sdf.__getitem__, ['a', 'd']) def test_iloc(self): # 2227 result = self.frame.iloc[:, 0] assert isinstance(result, SparseSeries) tm.assert_sp_series_equal(result, self.frame['A']) # preserve sparse index type. #2251 data = {'A': [0, 1]} iframe = SparseDataFrame(data, default_kind='integer') tm.assert_class_equal(iframe['A'].sp_index, iframe.iloc[:, 0].sp_index) def test_set_value(self): # ok, as the index gets converted to object frame = self.frame.copy() with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): res = frame.set_value('foobar', 'B', 1.5) assert res.index.dtype == 'object' res = self.frame res.index = res.index.astype(object) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): res = self.frame.set_value('foobar', 'B', 1.5) assert res is not self.frame assert res.index[-1] == 'foobar' with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): assert res.get_value('foobar', 'B') == 1.5 with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): res2 = res.set_value('foobar', 'qux', 1.5) assert res2 is not res tm.assert_index_equal(res2.columns, pd.Index(list(self.frame.columns) + ['qux'])) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): assert res2.get_value('foobar', 'qux') == 1.5 def test_fancy_index_misc(self): # axis = 0 sliced = self.frame.iloc[-2:, :] expected = self.frame.reindex(index=self.frame.index[-2:]) tm.assert_sp_frame_equal(sliced, expected) # axis = 1 sliced = self.frame.iloc[:, -2:] expected = self.frame.reindex(columns=self.frame.columns[-2:]) tm.assert_sp_frame_equal(sliced, expected) def test_getitem_overload(self): # slicing sl = self.frame[:20] tm.assert_sp_frame_equal(sl, self.frame.reindex(self.frame.index[:20])) # boolean indexing d = self.frame.index[5] indexer = self.frame.index > d subindex = self.frame.index[indexer] subframe = self.frame[indexer] tm.assert_index_equal(subindex, subframe.index) pytest.raises(Exception, self.frame.__getitem__, indexer[:-1]) def test_setitem(self): def _check_frame(frame, orig): N = len(frame) # insert SparseSeries frame['E'] = frame['A'] assert isinstance(frame['E'], SparseSeries) tm.assert_sp_series_equal(frame['E'], frame['A'], check_names=False) # insert SparseSeries differently-indexed to_insert = frame['A'][::2] frame['E'] = to_insert expected = to_insert.to_dense().reindex(frame.index) result = frame['E'].to_dense() tm.assert_series_equal(result, expected, check_names=False) assert result.name == 'E' # insert Series frame['F'] = frame['A'].to_dense() assert isinstance(frame['F'], SparseSeries) tm.assert_sp_series_equal(frame['F'], frame['A'], check_names=False) # insert Series differently-indexed to_insert = frame['A'].to_dense()[::2] frame['G'] = to_insert expected = to_insert.reindex(frame.index) expected.name = 'G' tm.assert_series_equal(frame['G'].to_dense(), expected) # insert ndarray frame['H'] = np.random.randn(N) assert isinstance(frame['H'], SparseSeries) to_sparsify = np.random.randn(N) to_sparsify[N // 2:] = frame.default_fill_value frame['I'] = to_sparsify assert len(frame['I'].sp_values) == N // 2 # insert ndarray wrong size pytest.raises(Exception, frame.__setitem__, 'foo', np.random.randn(N - 1)) # scalar value frame['J'] = 5 assert len(frame['J'].sp_values) == N assert (frame['J'].sp_values == 5).all() frame['K'] = frame.default_fill_value assert len(frame['K'].sp_values) == 0 self._check_all(_check_frame) def test_setitem_corner(self): self.frame['a'] = self.frame['B'] tm.assert_sp_series_equal(self.frame['a'], self.frame['B'], check_names=False) def test_setitem_array(self): arr = self.frame['B'] self.frame['E'] = arr tm.assert_sp_series_equal(self.frame['E'], self.frame['B'], check_names=False) self.frame['F'] = arr[:-1] index = self.frame.index[:-1] tm.assert_sp_series_equal(self.frame['E'].reindex(index), self.frame['F'].reindex(index), check_names=False) def test_setitem_chained_no_consolidate(self): # https://github.com/pandas-dev/pandas/pull/19268 # issuecomment-361696418 # chained setitem used to cause consolidation sdf = pd.SparseDataFrame([[np.nan, 1], [2, np.nan]]) with pd.option_context('mode.chained_assignment', None): sdf[0][1] = 2 assert len(sdf._data.blocks) == 2 def test_delitem(self): A = self.frame['A'] C = self.frame['C'] del self.frame['B'] assert 'B' not in self.frame tm.assert_sp_series_equal(self.frame['A'], A) tm.assert_sp_series_equal(self.frame['C'], C) del self.frame['D'] assert 'D' not in self.frame del self.frame['A'] assert 'A' not in self.frame def test_set_columns(self): self.frame.columns = self.frame.columns pytest.raises(Exception, setattr, self.frame, 'columns', self.frame.columns[:-1]) def test_set_index(self): self.frame.index = self.frame.index pytest.raises(Exception, setattr, self.frame, 'index', self.frame.index[:-1]) def test_append(self): a = self.frame[:5] b = self.frame[5:] appended = a.append(b) tm.assert_sp_frame_equal(appended, self.frame, exact_indices=False) a = self.frame.iloc[:5, :3] b = self.frame.iloc[5:] with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # Stacklevel is set for pd.concat, not append appended = a.append(b) tm.assert_sp_frame_equal(appended.iloc[:, :3], self.frame.iloc[:, :3], exact_indices=False) a = a[['B', 'C', 'A']].head(2) b = b.head(2) expected = pd.SparseDataFrame({ "B": [0., 1, None, 3], "C": [0., 1, 5, 6], "A": [None, None, 2, 3], "D": [None, None, 5, None], }, index=a.index | b.index, columns=['B', 'C', 'A', 'D']) with tm.assert_produces_warning(None): appended = a.append(b, sort=False) tm.assert_frame_equal(appended, expected) with tm.assert_produces_warning(None): appended = a.append(b, sort=True) tm.assert_sp_frame_equal(appended, expected[['A', 'B', 'C', 'D']]) def test_astype(self): sparse = pd.SparseDataFrame({'A': SparseArray([1, 2, 3, 4], dtype=np.int64), 'B': SparseArray([4, 5, 6, 7], dtype=np.int64)}) assert sparse['A'].dtype == np.int64 assert sparse['B'].dtype == np.int64 res = sparse.astype(np.float64) exp = pd.SparseDataFrame({'A': SparseArray([1., 2., 3., 4.], fill_value=0.), 'B': SparseArray([4., 5., 6., 7.], fill_value=0.)}, default_fill_value=np.nan) tm.assert_sp_frame_equal(res, exp) assert res['A'].dtype == np.float64 assert res['B'].dtype == np.float64 sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4], dtype=np.int64), 'B': SparseArray([0, 5, 0, 7], dtype=np.int64)}, default_fill_value=0) assert sparse['A'].dtype == np.int64 assert sparse['B'].dtype == np.int64 res = sparse.astype(np.float64) exp = pd.SparseDataFrame({'A': SparseArray([0., 2., 0., 4.], fill_value=0.), 'B': SparseArray([0., 5., 0., 7.], fill_value=0.)}, default_fill_value=0.) tm.assert_sp_frame_equal(res, exp) assert res['A'].dtype == np.float64 assert res['B'].dtype == np.float64 def test_astype_bool(self): sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4], fill_value=0, dtype=np.int64), 'B': SparseArray([0, 5, 0, 7], fill_value=0, dtype=np.int64)}, default_fill_value=0) assert sparse['A'].dtype == np.int64 assert sparse['B'].dtype == np.int64 res = sparse.astype(bool) exp = pd.SparseDataFrame({'A': SparseArray([False, True, False, True], dtype=np.bool, fill_value=False), 'B': SparseArray([False, True, False, True], dtype=np.bool, fill_value=False)}, default_fill_value=False) tm.assert_sp_frame_equal(res, exp) assert res['A'].dtype == np.bool assert res['B'].dtype == np.bool def test_fillna(self): df = self.zframe.reindex(lrange(5)) dense = self.zorig.reindex(lrange(5)) result = df.fillna(0) expected = dense.fillna(0) tm.assert_sp_frame_equal(result, expected.to_sparse(fill_value=0), exact_indices=False) tm.assert_frame_equal(result.to_dense(), expected) result = df.copy() result.fillna(0, inplace=True) expected = dense.fillna(0) tm.assert_sp_frame_equal(result, expected.to_sparse(fill_value=0), exact_indices=False) tm.assert_frame_equal(result.to_dense(), expected) result = df.copy() result = df['A'] result.fillna(0, inplace=True) expected = dense['A'].fillna(0) # this changes internal SparseArray repr # tm.assert_sp_series_equal(result, expected.to_sparse(fill_value=0)) tm.assert_series_equal(result.to_dense(), expected) def test_fillna_fill_value(self): df = pd.DataFrame({'A': [1, 0, 0], 'B': [np.nan, np.nan, 4]}) sparse = pd.SparseDataFrame(df) tm.assert_frame_equal(sparse.fillna(-1).to_dense(), df.fillna(-1), check_dtype=False) sparse = pd.SparseDataFrame(df, default_fill_value=0) tm.assert_frame_equal(sparse.fillna(-1).to_dense(), df.fillna(-1), check_dtype=False) def test_sparse_frame_pad_backfill_limit(self): index = np.arange(10) df = DataFrame(np.random.randn(10, 4), index=index) sdf = df.to_sparse() result = sdf[:2].reindex(index, method='pad', limit=5) expected = sdf[:2].reindex(index).fillna(method='pad') expected = expected.to_dense() expected.values[-3:] = np.nan expected = expected.to_sparse() tm.assert_frame_equal(result, expected) result = sdf[-2:].reindex(index, method='backfill', limit=5) expected = sdf[-2:].reindex(index).fillna(method='backfill') expected = expected.to_dense() expected.values[:3] = np.nan expected = expected.to_sparse() tm.assert_frame_equal(result, expected) def test_sparse_frame_fillna_limit(self): index = np.arange(10) df = DataFrame(np.random.randn(10, 4), index=index) sdf = df.to_sparse() result = sdf[:2].reindex(index) result = result.fillna(method='pad', limit=5) expected = sdf[:2].reindex(index).fillna(method='pad') expected = expected.to_dense() expected.values[-3:] = np.nan expected = expected.to_sparse() tm.assert_frame_equal(result, expected) result = sdf[-2:].reindex(index) result = result.fillna(method='backfill', limit=5) expected = sdf[-2:].reindex(index).fillna(method='backfill') expected = expected.to_dense() expected.values[:3] = np.nan expected = expected.to_sparse() tm.assert_frame_equal(result, expected) def test_rename(self): result = self.frame.rename(index=str) expected = SparseDataFrame(self.data, index=self.dates.strftime( "%Y-%m-%d %H:%M:%S")) tm.assert_sp_frame_equal(result, expected) result = self.frame.rename(columns=lambda x: '%s%d' % (x, len(x))) data = {'A1': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B1': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], 'C1': np.arange(10, dtype=np.float64), 'D1': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} expected = SparseDataFrame(data, index=self.dates) tm.assert_sp_frame_equal(result, expected) def test_corr(self): res = self.frame.corr() tm.assert_frame_equal(res, self.frame.to_dense().corr()) def test_describe(self): self.frame['foo'] = np.nan self.frame.get_dtype_counts() str(self.frame) desc = self.frame.describe() # noqa def test_join(self): left = self.frame.loc[:, ['A', 'B']] right = self.frame.loc[:, ['C', 'D']] joined = left.join(right) tm.assert_sp_frame_equal(joined, self.frame, exact_indices=False) right = self.frame.loc[:, ['B', 'D']] pytest.raises(Exception, left.join, right) with tm.assert_raises_regex(ValueError, 'Other Series must have a name'): self.frame.join(Series( np.random.randn(len(self.frame)), index=self.frame.index)) def test_reindex(self): def _check_frame(frame): index = frame.index sidx = index[::2] sidx2 = index[:5] # noqa sparse_result = frame.reindex(sidx) dense_result = frame.to_dense().reindex(sidx) tm.assert_frame_equal(sparse_result.to_dense(), dense_result) tm.assert_frame_equal(frame.reindex(list(sidx)).to_dense(), dense_result) sparse_result2 = sparse_result.reindex(index) dense_result2 = dense_result.reindex(index) tm.assert_frame_equal(sparse_result2.to_dense(), dense_result2) # propagate CORRECT fill value tm.assert_almost_equal(sparse_result.default_fill_value, frame.default_fill_value) tm.assert_almost_equal(sparse_result['A'].fill_value, frame['A'].fill_value) # length zero length_zero = frame.reindex([]) assert len(length_zero) == 0 assert len(length_zero.columns) == len(frame.columns) assert len(length_zero['A']) == 0 # frame being reindexed has length zero length_n = length_zero.reindex(index) assert len(length_n) == len(frame) assert len(length_n.columns) == len(frame.columns) assert len(length_n['A']) == len(frame) # reindex columns reindexed = frame.reindex(columns=['A', 'B', 'Z']) assert len(reindexed.columns) == 3 tm.assert_almost_equal(reindexed['Z'].fill_value, frame.default_fill_value) assert np.isnan(reindexed['Z'].sp_values).all() _check_frame(self.frame) _check_frame(self.iframe) _check_frame(self.zframe) _check_frame(self.fill_frame) # with copy=False reindexed = self.frame.reindex(self.frame.index, copy=False) reindexed['F'] = reindexed['A'] assert 'F' in self.frame reindexed = self.frame.reindex(self.frame.index) reindexed['G'] = reindexed['A'] assert 'G' not in self.frame def test_reindex_fill_value(self): rng = bdate_range('20110110', periods=20) result = self.zframe.reindex(rng, fill_value=0) exp = self.zorig.reindex(rng, fill_value=0) exp = exp.to_sparse(self.zframe.default_fill_value) tm.assert_sp_frame_equal(result, exp) def test_reindex_method(self): sparse = SparseDataFrame(data=[[11., 12., 14.], [21., 22., 24.], [41., 42., 44.]], index=[1, 2, 4], columns=[1, 2, 4], dtype=float) # Over indices # default method result = sparse.reindex(index=range(6)) expected = SparseDataFrame(data=[[nan, nan, nan], [11., 12., 14.], [21., 22., 24.], [nan, nan, nan], [41., 42., 44.], [nan, nan, nan]], index=range(6), columns=[1, 2, 4], dtype=float) tm.assert_sp_frame_equal(result, expected) # method='bfill' result = sparse.reindex(index=range(6), method='bfill') expected = SparseDataFrame(data=[[11., 12., 14.], [11., 12., 14.], [21., 22., 24.], [41., 42., 44.], [41., 42., 44.], [nan, nan, nan]], index=range(6), columns=[1, 2, 4], dtype=float) tm.assert_sp_frame_equal(result, expected) # method='ffill' result = sparse.reindex(index=range(6), method='ffill') expected = SparseDataFrame(data=[[nan, nan, nan], [11., 12., 14.], [21., 22., 24.], [21., 22., 24.], [41., 42., 44.], [41., 42., 44.]], index=range(6), columns=[1, 2, 4], dtype=float) tm.assert_sp_frame_equal(result, expected) # Over columns # default method result = sparse.reindex(columns=range(6)) expected = SparseDataFrame(data=[[nan, 11., 12., nan, 14., nan], [nan, 21., 22., nan, 24., nan], [nan, 41., 42., nan, 44., nan]], index=[1, 2, 4], columns=range(6), dtype=float) tm.assert_sp_frame_equal(result, expected) # method='bfill' with pytest.raises(NotImplementedError): sparse.reindex(columns=range(6), method='bfill') # method='ffill' with pytest.raises(NotImplementedError): sparse.reindex(columns=range(6), method='ffill') def test_take(self): result = self.frame.take([1, 0, 2], axis=1) expected = self.frame.reindex(columns=['B', 'A', 'C']) tm.assert_sp_frame_equal(result, expected) def test_to_dense(self): def _check(frame, orig): dense_dm = frame.to_dense() tm.assert_frame_equal(frame, dense_dm) tm.assert_frame_equal(dense_dm, orig, check_dtype=False) self._check_all(_check) def test_stack_sparse_frame(self): with catch_warnings(record=True): def _check(frame): dense_frame = frame.to_dense() # noqa wp = Panel.from_dict({'foo': frame}) from_dense_lp = wp.to_frame() from_sparse_lp = spf.stack_sparse_frame(frame) tm.assert_numpy_array_equal(from_dense_lp.values, from_sparse_lp.values) _check(self.frame) _check(self.iframe) # for now pytest.raises(Exception, _check, self.zframe) pytest.raises(Exception, _check, self.fill_frame) def test_transpose(self): def _check(frame, orig): transposed = frame.T untransposed = transposed.T tm.assert_sp_frame_equal(frame, untransposed) tm.assert_frame_equal(frame.T.to_dense(), orig.T) tm.assert_frame_equal(frame.T.T.to_dense(), orig.T.T) tm.assert_sp_frame_equal(frame, frame.T.T, exact_indices=False) self._check_all(_check) def test_shift(self): def _check(frame, orig): shifted = frame.shift(0) exp = orig.shift(0) tm.assert_frame_equal(shifted.to_dense(), exp) shifted = frame.shift(1) exp = orig.shift(1) tm.assert_frame_equal(shifted, exp) shifted = frame.shift(-2) exp = orig.shift(-2) tm.assert_frame_equal(shifted, exp) shifted = frame.shift(2, freq='B') exp = orig.shift(2, freq='B') exp = exp.to_sparse(frame.default_fill_value, kind=frame.default_kind) tm.assert_frame_equal(shifted, exp) shifted = frame.shift(2, freq=BDay()) exp = orig.shift(2, freq=BDay()) exp = exp.to_sparse(frame.default_fill_value, kind=frame.default_kind) tm.assert_frame_equal(shifted, exp) self._check_all(_check) def test_count(self): dense_result = self.frame.to_dense().count() result = self.frame.count() tm.assert_series_equal(result, dense_result) result = self.frame.count(axis=None) tm.assert_series_equal(result, dense_result) result = self.frame.count(axis=0) tm.assert_series_equal(result, dense_result) result = self.frame.count(axis=1) dense_result = self.frame.to_dense().count(axis=1) # win32 don't check dtype tm.assert_series_equal(result, dense_result, check_dtype=False) def _check_all(self, check_func): check_func(self.frame, self.orig) check_func(self.iframe, self.iorig) check_func(self.zframe, self.zorig) check_func(self.fill_frame, self.fill_orig) def test_numpy_transpose(self): sdf = SparseDataFrame([1, 2, 3], index=[1, 2, 3], columns=['a']) result = np.transpose(np.transpose(sdf)) tm.assert_sp_frame_equal(result, sdf) msg = "the 'axes' parameter is not supported" tm.assert_raises_regex(ValueError, msg, np.transpose, sdf, axes=1) def test_combine_first(self): df = self.frame result = df[::2].combine_first(df) result2 = df[::2].combine_first(df.to_dense()) expected = df[::2].to_dense().combine_first(df.to_dense()) expected = expected.to_sparse(fill_value=df.default_fill_value) tm.assert_sp_frame_equal(result, result2) tm.assert_sp_frame_equal(result, expected) def test_combine_add(self): df = self.frame.to_dense() df2 = df.copy() df2['C'][:3] = np.nan df['A'][:3] = 5.7 result = df.to_sparse().add(df2.to_sparse(), fill_value=0) expected = df.add(df2, fill_value=0).to_sparse() tm.assert_sp_frame_equal(result, expected) def test_isin(self): sparse_df = DataFrame({'flag': [1., 0., 1.]}).to_sparse(fill_value=0.) xp = sparse_df[sparse_df.flag == 1.] rs = sparse_df[sparse_df.flag.isin([1.])] tm.assert_frame_equal(xp, rs) def test_sparse_pow_issue(self): # 2220 df = SparseDataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]}) # note : no error without nan df = SparseDataFrame({'A': [nan, 0, 1]}) # note that 2 ** df works fine, also df ** 1 result = 1 ** df r1 = result.take([0], 1)['A'] r2 = result['A'] assert len(r2.sp_values) == len(r1.sp_values) def test_as_blocks(self): df = SparseDataFrame({'A': [1.1, 3.3], 'B': [nan, -3.9]}, dtype='float64') # deprecated 0.21.0 with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): df_blocks = df.blocks assert list(df_blocks.keys()) == ['float64'] tm.assert_frame_equal(df_blocks['float64'], df) @pytest.mark.xfail(reason='nan column names in _init_dict problematic ' '(GH 16894)') def test_nan_columnname(self): # GH 8822 nan_colname = DataFrame(Series(1.0, index=[0]), columns=[nan]) nan_colname_sparse = nan_colname.to_sparse() assert np.isnan(nan_colname_sparse.columns[0]) def test_isna(self): # GH 8276 df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan], 'B': [0, np.nan, np.nan, 2, np.nan]}) res = df.isna() exp = pd.SparseDataFrame({'A': [True, True, False, False, True], 'B': [False, True, True, False, True]}, default_fill_value=True) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) # if fill_value is not nan, True can be included in sp_values df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan], 'B': [0, np.nan, 0, 2, np.nan]}, default_fill_value=0.) res = df.isna() assert isinstance(res, pd.SparseDataFrame) exp = pd.DataFrame({'A': [False, False, False, False, True], 'B': [False, True, False, False, True]}) tm.assert_frame_equal(res.to_dense(), exp) def test_notna(self): # GH 8276 df = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan], 'B': [0, np.nan, np.nan, 2, np.nan]}) res = df.notna() exp = pd.SparseDataFrame({'A': [False, False, True, True, False], 'B': [True, False, False, True, False]}, default_fill_value=False) exp._default_fill_value = np.nan tm.assert_sp_frame_equal(res, exp) # if fill_value is not nan, True can be included in sp_values df = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan], 'B': [0, np.nan, 0, 2, np.nan]}, default_fill_value=0.) res = df.notna() assert isinstance(res, pd.SparseDataFrame) exp = pd.DataFrame({'A': [True, True, True, True, False], 'B': [True, False, True, True, False]}) tm.assert_frame_equal(res.to_dense(), exp) class TestSparseDataFrameArithmetic(object): def test_numeric_op_scalar(self): df = pd.DataFrame({'A': [nan, nan, 0, 1, ], 'B': [0, 1, 2, nan], 'C': [1., 2., 3., 4.], 'D': [nan, nan, nan, nan]}) sparse = df.to_sparse() tm.assert_sp_frame_equal(sparse + 1, (df + 1).to_sparse()) def test_comparison_op_scalar(self): # GH 13001 df = pd.DataFrame({'A': [nan, nan, 0, 1, ], 'B': [0, 1, 2, nan], 'C': [1., 2., 3., 4.], 'D': [nan, nan, nan, nan]}) sparse = df.to_sparse() # comparison changes internal repr, compare with dense res = sparse > 1 assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), df > 1) res = sparse != 0 assert isinstance(res, pd.SparseDataFrame) tm.assert_frame_equal(res.to_dense(), df != 0) class TestSparseDataFrameAnalytics(object): def setup_method(self, method): self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], 'C': np.arange(10, dtype=float), 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} self.dates = bdate_range('1/1/2011', periods=10) self.frame = SparseDataFrame(self.data, index=self.dates) def test_cumsum(self): expected = SparseDataFrame(self.frame.to_dense().cumsum()) result = self.frame.cumsum() tm.assert_sp_frame_equal(result, expected) result = self.frame.cumsum(axis=None) tm.assert_sp_frame_equal(result, expected) result = self.frame.cumsum(axis=0) tm.assert_sp_frame_equal(result, expected) def test_numpy_cumsum(self): result = np.cumsum(self.frame) expected = SparseDataFrame(self.frame.to_dense().cumsum()) tm.assert_sp_frame_equal(result, expected) msg = "the 'dtype' parameter is not supported" tm.assert_raises_regex(ValueError, msg, np.cumsum, self.frame, dtype=np.int64) msg = "the 'out' parameter is not supported" tm.assert_raises_regex(ValueError, msg, np.cumsum, self.frame, out=result) def test_numpy_func_call(self): # no exception should be raised even though # numpy passes in 'axis=None' or `axis=-1' funcs = ['sum', 'cumsum', 'var', 'mean', 'prod', 'cumprod', 'std', 'min', 'max'] for func in funcs: getattr(np, func)(self.frame) @pytest.mark.xfail(reason='Wrong SparseBlock initialization ' '(GH 17386)') def test_quantile(self): # GH 17386 data = [[1, 1], [2, 10], [3, 100], [nan, nan]] q = 0.1 sparse_df = SparseDataFrame(data) result = sparse_df.quantile(q) dense_df = DataFrame(data) dense_expected = dense_df.quantile(q) sparse_expected = SparseSeries(dense_expected) tm.assert_series_equal(result, dense_expected) tm.assert_sp_series_equal(result, sparse_expected) @pytest.mark.xfail(reason='Wrong SparseBlock initialization ' '(GH 17386)') def test_quantile_multi(self): # GH 17386 data = [[1, 1], [2, 10], [3, 100], [nan, nan]] q = [0.1, 0.5] sparse_df = SparseDataFrame(data) result = sparse_df.quantile(q) dense_df = DataFrame(data) dense_expected = dense_df.quantile(q) sparse_expected = SparseDataFrame(dense_expected) tm.assert_frame_equal(result, dense_expected) tm.assert_sp_frame_equal(result, sparse_expected) def test_assign_with_sparse_frame(self): # GH 19163 df = pd.DataFrame({"a": [1, 2, 3]}) res = df.to_sparse(fill_value=False).assign(newcol=False) exp = df.assign(newcol=False).to_sparse(fill_value=False) tm.assert_sp_frame_equal(res, exp) for column in res.columns: assert type(res[column]) is SparseSeries