laywerrobot/lib/python3.6/site-packages/pandas/tests/sparse/frame/test_frame.py

1304 lines
48 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
# pylint: disable-msg=E1101,W0612
import operator
import pytest
from warnings import catch_warnings
from numpy import nan
import numpy as np
import pandas as pd
from pandas import Series, DataFrame, bdate_range, Panel
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.tseries.offsets import BDay
from pandas.util import testing as tm
from pandas.compat import lrange
from pandas import compat
from pandas.core.sparse import frame as spf
from pandas._libs.sparse import BlockIndex, IntIndex
from pandas.core.sparse.api import SparseSeries, SparseDataFrame, SparseArray
from pandas.tests.frame.test_api import SharedWithSparse
class TestSparseDataFrame(SharedWithSparse):
klass = SparseDataFrame
# SharedWithSparse tests use generic, klass-agnostic assertion
_assert_frame_equal = staticmethod(tm.assert_sp_frame_equal)
_assert_series_equal = staticmethod(tm.assert_sp_series_equal)
def setup_method(self, method):
    """Build the dense and sparse fixture frames shared by the tests.

    The same ten-row data is exposed as a NaN-filled sparse frame
    (``frame``), an integer-kind sparse frame (``iframe``), a zero-filled
    one (``zframe``) and a two-filled one (``fill_frame``), each with a
    dense counterpart (``orig``, ``iorig``, ``zorig``, ``fill_orig``).
    """
    column_names = ['A', 'B', 'C', 'D']

    self.data = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
                 'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
                 'C': np.arange(10, dtype=np.float64),
                 'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}
    self.dates = bdate_range('1/1/2011', periods=10)

    # dense references
    self.orig = pd.DataFrame(self.data, index=self.dates)
    self.iorig = pd.DataFrame(self.data, index=self.dates)

    # NaN-filled sparse frames
    self.frame = SparseDataFrame(self.data, index=self.dates)
    self.iframe = SparseDataFrame(self.data, index=self.dates,
                                  default_kind='integer')
    self.mixed_frame = self.frame.copy(False)
    self.mixed_frame['foo'] = pd.SparseArray(['bar'] * len(self.dates))

    # missing entries replaced by 0
    zero_filled = self.frame.values.copy()
    zero_filled[np.isnan(zero_filled)] = 0
    self.zorig = pd.DataFrame(zero_filled, columns=column_names,
                              index=self.dates)
    self.zframe = SparseDataFrame(zero_filled, columns=column_names,
                                  default_fill_value=0, index=self.dates)

    # missing entries replaced by 2
    two_filled = self.frame.values.copy()
    two_filled[np.isnan(two_filled)] = 2
    self.fill_orig = pd.DataFrame(two_filled, columns=column_names,
                                  index=self.dates)
    self.fill_frame = SparseDataFrame(two_filled, columns=column_names,
                                      default_fill_value=2,
                                      index=self.dates)

    self.empty = SparseDataFrame()
def test_fill_value_when_combine_const(self):
    """``add`` with ``fill_value`` equals ``fillna`` followed by ``add``."""
    # GH12723
    values = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
    sparse = SparseDataFrame({'foo': values}, index=range(6))

    expected = sparse.fillna(0).add(2)
    result = sparse.add(2, fill_value=0)
    tm.assert_sp_frame_equal(result, expected)
def test_values(self):
    """``.values`` keeps the frame shape even when an axis is empty."""
    assert self.empty.values.shape == (0, 0)

    without_columns = SparseDataFrame(index=np.arange(10))
    assert without_columns.values.shape == (10, 0)

    without_index = SparseDataFrame(columns=np.arange(10))
    assert without_index.values.shape == (0, 10)
def test_copy(self):
    """A copy is a SparseDataFrame equal to the source frame."""
    duplicate = self.frame.copy()
    assert isinstance(duplicate, SparseDataFrame)
    tm.assert_sp_frame_equal(duplicate, self.frame)

    # as of v0.15.0 the index is identical, though not the same object
    assert duplicate.index.identical(self.frame.index)
def test_constructor(self):
    """Exercise the main SparseDataFrame construction paths."""
    for _name, column in compat.iteritems(self.frame):
        assert isinstance(column, SparseSeries)

    assert isinstance(self.iframe['A'].sp_index, IntIndex)

    # zframe was constructed from a matrix in setup_method
    assert self.zframe['A'].fill_value == 0
    expected_sparse = pd.SparseArray([1., 2., 3., 4., 5., 6.])
    tm.assert_numpy_array_equal(expected_sparse, self.zframe['A'].values)
    expected_dense = np.array([0., 0., 0., 0., 1., 2., 3., 4., 5., 6.])
    tm.assert_numpy_array_equal(expected_dense,
                                self.zframe['A'].to_dense().values)

    # construct no data
    no_data = SparseDataFrame(columns=np.arange(10), index=np.arange(10))
    for _name, column in compat.iteritems(no_data):
        assert isinstance(column, SparseSeries)

    # construct from nested dict
    nested = {name: column.to_dict()
              for name, column in compat.iteritems(self.frame)}
    from_nested = SparseDataFrame(nested)
    tm.assert_sp_frame_equal(from_nested, self.frame)

    # TODO: test data is copied from inputs

    # init dict with different index
    head_index = self.frame.index[:5]
    constructed = SparseDataFrame(
        self.frame,
        index=head_index,
        columns=self.frame.columns,
        default_fill_value=self.frame.default_fill_value,
        default_kind=self.frame.default_kind,
        copy=True)
    reindexed = self.frame.reindex(head_index)
    tm.assert_sp_frame_equal(constructed, reindexed, exact_indices=False)

    # assert level parameter breaks reindex
    with pytest.raises(TypeError):
        self.frame.reindex(head_index, level=0)

    repr(self.frame)
def test_constructor_dict_order(self):
    """Column order from a dict follows the interpreter's dict ordering."""
    # GH19018
    # initialization ordering: by insertion order if python>= 3.6, else
    # order by value
    source = {'b': [2, 3], 'a': [0, 1]}
    frame = SparseDataFrame(data=source)

    column_order = list('ba') if compat.PY36 else list('ab')
    expected = SparseDataFrame(data=source, columns=column_order)
    tm.assert_sp_frame_equal(frame, expected)
def test_constructor_ndarray(self):
    """Construct from ndarray / 1-d data and reject mismatched axes."""
    # no index or columns: only checks that construction succeeds
    SparseDataFrame(self.frame.values)

    # 1d
    single = SparseDataFrame(self.data['A'], index=self.dates,
                             columns=['A'])
    tm.assert_sp_frame_equal(single, self.frame.reindex(columns=['A']))

    # raise on level argument
    with pytest.raises(TypeError):
        self.frame.reindex(columns=['A'], level=1)

    # wrong length index / columns
    short_index = self.frame.index[:-1]
    with tm.assert_raises_regex(ValueError, "^Index length"):
        SparseDataFrame(self.frame.values, index=short_index)

    short_columns = self.frame.columns[:-1]
    with tm.assert_raises_regex(ValueError, "^Column length"):
        SparseDataFrame(self.frame.values, columns=short_columns)
# GH 9272
def test_constructor_empty(self):
    """A frame built without arguments has no rows and no columns."""
    empty = SparseDataFrame()
    for axis in (empty.index, empty.columns):
        assert len(axis) == 0
def test_constructor_dataframe(self):
    """Round-tripping through a dense frame rebuilds an equal sparse frame."""
    rebuilt = SparseDataFrame(self.frame.to_dense())
    tm.assert_sp_frame_equal(rebuilt, self.frame)
def test_constructor_convert_index_once(self):
    """Columns of a frame built from an ndarray index share one index object."""
    raw_index = np.array([1.5, 2.5, 3.5])
    sdf = SparseDataFrame(columns=lrange(4), index=raw_index)
    first, second = sdf[0], sdf[1]
    assert first.index is second.index
def test_constructor_from_series(self):
    """A SparseSeries can be wrapped directly in a SparseDataFrame.

    The second half only builds sparse series from mostly-missing data;
    the frame constructions that would consume them are still disabled
    (see the commented-out calls below).
    """
    # GH 2873
    x = Series(np.random.randn(10000), name='a')
    x = x.to_sparse(fill_value=0)
    assert isinstance(x, SparseSeries)
    df = SparseDataFrame(x)
    assert isinstance(df, SparseDataFrame)

    x = Series(np.random.randn(10000), name='a')
    y = Series(np.random.randn(10000), name='b')
    x2 = x.astype(float)
    # use the canonical ``np.nan`` spelling; the ``np.NaN`` alias is gone
    # from NumPy 2.0
    x2.loc[:9998] = np.nan
    # TODO: x_sparse is unused...fix
    x_sparse = x2.to_sparse(fill_value=np.nan)  # noqa

    # Currently fails too with weird ufunc error
    # df1 = SparseDataFrame([x_sparse, y])

    y.loc[:9998] = 0
    # TODO: y_sparse is unused...fix
    y_sparse = y.to_sparse(fill_value=0)  # noqa
    # without sparse value raises error
    # df2 = SparseDataFrame([x2_sparse, y])
def test_constructor_from_dense_series(self):
    """A dense Series, named or not, converts like ``to_frame().to_sparse()``."""
    # GH 19393
    # first a series with a name, then one without
    for series_kwargs in ({'name': 'a'}, {}):
        dense = Series(np.random.randn(10000), **series_kwargs)
        result = SparseDataFrame(dense)
        expected = dense.to_frame().to_sparse()
        tm.assert_sp_frame_equal(result, expected)
def test_constructor_from_unknown_type(self):
    """Constructing from an unsupported object raises a descriptive TypeError.

    The message is asserted with a regex check. The previous
    ``pytest.raises(..., message=...)`` form only set the text shown when
    no exception was raised and never inspected the exception itself.
    """
    # GH 19393
    class Unknown(object):
        pass

    msg = ('SparseDataFrame called with unknown type '
           '"Unknown" for data argument')
    with tm.assert_raises_regex(TypeError, msg):
        SparseDataFrame(Unknown())
def test_constructor_preserve_attr(self):
    """dtype and fill_value of the input survive every construction path."""
    # GH 13866
    def _assert_int64_zero_fill(sparse_obj):
        assert sparse_obj.dtype == np.int64
        assert sparse_obj.fill_value == 0

    arr = pd.SparseArray([1, 0, 3, 0], dtype=np.int64, fill_value=0)
    _assert_int64_zero_fill(arr)

    from_array_dict = pd.SparseDataFrame({'x': arr})
    _assert_int64_zero_fill(from_array_dict['x'])

    series = pd.SparseSeries(arr, name='x')
    _assert_int64_zero_fill(series)

    from_series = pd.SparseDataFrame(series)
    _assert_int64_zero_fill(from_series['x'])

    from_series_dict = pd.SparseDataFrame({'x': series})
    _assert_int64_zero_fill(from_series_dict['x'])
def test_constructor_nan_dataframe(self):
    """An all-NaN MultiIndexed matrix converts the same way via both routes."""
    # GH 10079
    trains = np.arange(100)
    thresholds = [10, 20, 30, 40, 50, 60]
    pairs = [(train, threshold)
             for train in trains
             for threshold in thresholds]
    index = pd.MultiIndex.from_tuples(pairs,
                                      names=['trains', 'thresholds'])

    matrix = np.empty((len(index), len(trains)))
    matrix.fill(np.nan)

    dense = pd.DataFrame(matrix, index=index, columns=trains, dtype=float)
    result = dense.to_sparse()
    expected = pd.SparseDataFrame(matrix, index=index, columns=trains,
                                  dtype=float)
    tm.assert_sp_frame_equal(result, expected)
def test_type_coercion_at_construction(self):
    """``dtype`` passed to the constructor is applied to every column."""
    # GH 15682
    result = pd.SparseDataFrame(
        {'a': [1, 0, 0], 'b': [0, 1, 0], 'c': [0, 0, 1]},
        dtype='uint8',
        default_fill_value=0)

    col_a = pd.SparseSeries([1, 0, 0], dtype='uint8')
    col_b = pd.SparseSeries([0, 1, 0], dtype='uint8')
    col_c = pd.SparseSeries([0, 0, 1], dtype='uint8')
    expected = pd.SparseDataFrame({'a': col_a, 'b': col_b, 'c': col_c},
                                  default_fill_value=0)

    tm.assert_sp_frame_equal(result, expected)
def test_dtypes(self):
    """``get_dtype_counts`` reports four float64 columns for a sparse frame."""
    dense = DataFrame(np.random.randn(10000, 4))
    dense.loc[:9998] = np.nan
    sparse = dense.to_sparse()

    counts = sparse.get_dtype_counts()
    tm.assert_series_equal(counts, Series({'float64': 4}))
def test_shape(self):
    """Every fixture frame reports a (10, 4) shape."""
    # see gh-10452
    fixtures = (self.frame, self.iframe, self.zframe, self.fill_frame)
    for sparse_frame in fixtures:
        assert sparse_frame.shape == (10, 4)
def test_str(self):
    """``str`` of a large, mostly-NaN sparse frame does not raise."""
    dense = DataFrame(np.random.randn(10000, 4))
    dense.loc[:9998] = np.nan
    str(dense.to_sparse())
def test_array_interface(self):
    """A NumPy ufunc on the sparse frame matches the dense computation."""
    sparse_sqrt = np.sqrt(self.frame)
    dense_sqrt = np.sqrt(self.frame.to_dense())
    tm.assert_frame_equal(sparse_sqrt.to_dense(), dense_sqrt)
def test_pickle(self):
    """Sparse frames survive a pickle round trip unchanged."""

    def _assert_roundtrip(sparse_frame, dense_frame):
        unpickled = tm.round_trip_pickle(sparse_frame)
        tm.assert_sp_frame_equal(sparse_frame, unpickled)
        tm.assert_frame_equal(unpickled.to_dense(), dense_frame,
                              check_dtype=False)

    _assert_roundtrip(SparseDataFrame(), DataFrame())
    self._check_all(_assert_roundtrip)
def test_dense_to_sparse(self):
    """``DataFrame.to_sparse`` honours the ``kind`` and ``fill_value`` options."""
    dense = DataFrame({'A': [nan, nan, nan, 1, 2],
                       'B': [1, 2, nan, nan, nan]})

    sparse = dense.to_sparse()
    assert isinstance(sparse, SparseDataFrame)
    assert np.isnan(sparse.default_fill_value)
    assert isinstance(sparse['A'].sp_index, BlockIndex)
    tm.assert_frame_equal(sparse.to_dense(), dense)

    sparse = dense.to_sparse(kind='integer')
    assert isinstance(sparse['A'].sp_index, IntIndex)

    dense = DataFrame({'A': [0, 0, 0, 1, 2],
                       'B': [1, 2, 0, 0, 0]}, dtype=float)
    sparse = dense.to_sparse(fill_value=0)
    assert sparse.default_fill_value == 0
    tm.assert_frame_equal(sparse.to_dense(), dense)
def test_density(self):
    """``density`` is the fraction of non-fill entries."""
    sparse_series = SparseSeries([nan, nan, nan, 0, 1, 2, 3, 4, 5, 6])
    assert sparse_series.density == 0.7

    columns = {'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
               'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
               'C': np.arange(10),
               'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}
    sparse_frame = SparseDataFrame(columns)
    assert sparse_frame.density == 0.75
def test_sparse_to_dense(self):
    """Placeholder: no assertions are implemented for this case yet."""
    return None
def test_sparse_series_ops(self):
    """Run the arithmetic checks on the NaN-filled fixture frame."""
    target = self.frame
    self._check_frame_ops(target)
def test_sparse_series_ops_i(self):
    """Run the arithmetic checks on the integer-kind fixture frame."""
    target = self.iframe
    self._check_frame_ops(target)
def test_sparse_series_ops_z(self):
    """Run the arithmetic checks on the zero-filled fixture frame."""
    target = self.zframe
    self._check_frame_ops(target)
def test_sparse_series_ops_fill(self):
    """Run the arithmetic checks on the two-filled fixture frame."""
    target = self.fill_frame
    self._check_frame_ops(target)
def _check_frame_ops(self, frame):
    """Compare arithmetic on ``frame`` with the same arithmetic done densely.

    Parameters
    ----------
    frame : SparseDataFrame
        Frame whose add/sub/mul/truediv/floordiv behaviour is checked
        against another frame, against column-aligned series (via the
        named methods with ``axis='index'``) and against row
        cross-sections (via the operators).
    """

    def _compare_to_dense(a, b, da, db, op):
        # the sparse result must equal the sparsified dense result
        sparse_result = op(a, b)
        dense_result = op(da, db)

        fill = sparse_result.default_fill_value
        dense_result = dense_result.to_sparse(fill_value=fill)
        tm.assert_sp_frame_equal(sparse_result, dense_result,
                                 exact_indices=False)

        # sparse-with-dense operands must stay sparse and agree as well
        if isinstance(a, DataFrame) and isinstance(db, DataFrame):
            mixed_result = op(a, db)
            assert isinstance(mixed_result, SparseDataFrame)
            tm.assert_sp_frame_equal(mixed_result, sparse_result,
                                     exact_indices=False)

    opnames = ['add', 'sub', 'mul', 'truediv', 'floordiv']
    ops = [getattr(operator, name) for name in opnames]

    fidx = frame.index

    # time series operations
    series = [frame['A'], frame['B'], frame['C'], frame['D'],
              frame['A'].reindex(fidx[:7]), frame['A'].reindex(fidx[::2]),
              SparseSeries([], index=[])]

    for opname, op in zip(opnames, ops):
        _compare_to_dense(frame, frame[::2], frame.to_dense(),
                          frame[::2].to_dense(), op)

        # 2304, no auto-broadcasting
        # ``opname`` is bound as a default so the helper does not depend on
        # the loop variable's later values.
        def _axis_op(a, b, opname=opname):
            return getattr(a, opname)(b, axis='index')

        for s in series:
            _compare_to_dense(frame, s, frame.to_dense(), s.to_dense(),
                              _axis_op)

            # rops are not implemented
            # _compare_to_dense(s, frame, s.to_dense(),
            #                   frame.to_dense(), _axis_op)

    # cross-sectional operations
    series = [frame.xs(fidx[0]), frame.xs(fidx[3]), frame.xs(fidx[5]),
              frame.xs(fidx[7]), frame.xs(fidx[5])[:2]]

    for op in ops:
        for s in series:
            _compare_to_dense(frame, s, frame.to_dense(), s, op)
            _compare_to_dense(s, frame, s, frame.to_dense(), op)

    # it works!
    result = self.frame + self.frame.loc[:, ['A', 'B']]  # noqa
def test_op_corners(self):
    """Adding an empty frame gives an all-NaN frame with the original index."""
    both_empty = self.empty + self.empty
    assert both_empty.empty

    frame_plus_empty = self.frame + self.empty
    assert isinstance(frame_plus_empty.index, DatetimeIndex)
    tm.assert_frame_equal(frame_plus_empty, self.frame * np.nan)

    empty_plus_frame = self.empty + self.frame
    tm.assert_frame_equal(empty_plus_frame, self.frame * np.nan)
def test_scalar_ops(self):
    """Placeholder: no assertions are implemented for this case yet."""
    return None
def test_getitem(self):
    """Selecting a list of columns matches a column reindex."""
    # 1585 select multiple columns
    sdf = SparseDataFrame(index=[0, 1, 2], columns=['a', 'b', 'c'])

    selected = sdf[['a', 'b']]
    expected = sdf.reindex(columns=['a', 'b'])
    tm.assert_sp_frame_equal(selected, expected)

    # a label that is not a column must raise
    with pytest.raises(Exception):
        sdf[['a', 'd']]
def test_iloc(self):
    """``iloc`` column selection returns the matching SparseSeries."""
    # 2227
    first_column = self.frame.iloc[:, 0]
    assert isinstance(first_column, SparseSeries)
    tm.assert_sp_series_equal(first_column, self.frame['A'])

    # preserve sparse index type. #2251
    int_frame = SparseDataFrame({'A': [0, 1]}, default_kind='integer')
    tm.assert_class_equal(int_frame['A'].sp_index,
                          int_frame.iloc[:, 0].sp_index)
def test_set_value(self):
    """``set_value``/``get_value`` warn and return new, enlarged frames."""

    def _expect_future_warning():
        return tm.assert_produces_warning(FutureWarning,
                                          check_stacklevel=False)

    # ok, as the index gets converted to object
    working_copy = self.frame.copy()
    with _expect_future_warning():
        res = working_copy.set_value('foobar', 'B', 1.5)
    assert res.index.dtype == 'object'

    res = self.frame
    res.index = res.index.astype(object)

    with _expect_future_warning():
        res = self.frame.set_value('foobar', 'B', 1.5)
    assert res is not self.frame
    assert res.index[-1] == 'foobar'
    with _expect_future_warning():
        assert res.get_value('foobar', 'B') == 1.5

    with _expect_future_warning():
        res2 = res.set_value('foobar', 'qux', 1.5)
    assert res2 is not res
    expected_columns = pd.Index(list(self.frame.columns) + ['qux'])
    tm.assert_index_equal(res2.columns, expected_columns)
    with _expect_future_warning():
        assert res2.get_value('foobar', 'qux') == 1.5
def test_fancy_index_misc(self):
    """Negative ``iloc`` slices match a reindex on the same labels."""
    # axis = 0
    last_rows = self.frame.iloc[-2:, :]
    by_index = self.frame.reindex(index=self.frame.index[-2:])
    tm.assert_sp_frame_equal(last_rows, by_index)

    # axis = 1
    last_columns = self.frame.iloc[:, -2:]
    by_columns = self.frame.reindex(columns=self.frame.columns[-2:])
    tm.assert_sp_frame_equal(last_columns, by_columns)
def test_getitem_overload(self):
    """``[]`` accepts row slices and boolean masks of matching length."""
    # slicing
    sliced = self.frame[:20]
    expected = self.frame.reindex(self.frame.index[:20])
    tm.assert_sp_frame_equal(sliced, expected)

    # boolean indexing
    pivot = self.frame.index[5]
    mask = self.frame.index > pivot

    expected_index = self.frame.index[mask]
    masked = self.frame[mask]
    tm.assert_index_equal(expected_index, masked.index)

    # a mask that is one element short must be rejected
    short_mask = mask[:-1]
    with pytest.raises(Exception):
        self.frame[short_mask]
def test_setitem(self):
    """Column assignment accepts sparse/dense series, ndarrays and scalars."""

    def _check_frame(frame, orig):
        n_rows = len(frame)

        # insert SparseSeries
        frame['E'] = frame['A']
        assert isinstance(frame['E'], SparseSeries)
        tm.assert_sp_series_equal(frame['E'], frame['A'],
                                  check_names=False)

        # insert SparseSeries differently-indexed
        sparse_subset = frame['A'][::2]
        frame['E'] = sparse_subset
        expected = sparse_subset.to_dense().reindex(frame.index)
        result = frame['E'].to_dense()
        tm.assert_series_equal(result, expected, check_names=False)
        assert result.name == 'E'

        # insert Series
        frame['F'] = frame['A'].to_dense()
        assert isinstance(frame['F'], SparseSeries)
        tm.assert_sp_series_equal(frame['F'], frame['A'],
                                  check_names=False)

        # insert Series differently-indexed
        dense_subset = frame['A'].to_dense()[::2]
        frame['G'] = dense_subset
        expected = dense_subset.reindex(frame.index)
        expected.name = 'G'
        tm.assert_series_equal(frame['G'].to_dense(), expected)

        # insert ndarray
        frame['H'] = np.random.randn(n_rows)
        assert isinstance(frame['H'], SparseSeries)

        half_fill = np.random.randn(n_rows)
        half_fill[n_rows // 2:] = frame.default_fill_value
        frame['I'] = half_fill
        assert len(frame['I'].sp_values) == n_rows // 2

        # insert ndarray wrong size
        too_short = np.random.randn(n_rows - 1)
        with pytest.raises(Exception):
            frame['foo'] = too_short

        # scalar value
        frame['J'] = 5
        assert len(frame['J'].sp_values) == n_rows
        assert (frame['J'].sp_values == 5).all()

        frame['K'] = frame.default_fill_value
        assert len(frame['K'].sp_values) == 0

    self._check_all(_check_frame)
def test_setitem_corner(self):
    """Assigning an existing column under a new label copies its values."""
    source = self.frame['B']
    self.frame['a'] = source
    tm.assert_sp_series_equal(self.frame['a'], self.frame['B'],
                              check_names=False)
def test_setitem_array(self):
    """Assigning a full and a truncated column agree on the shared index."""
    column_b = self.frame['B']

    self.frame['E'] = column_b
    tm.assert_sp_series_equal(self.frame['E'], self.frame['B'],
                              check_names=False)

    self.frame['F'] = column_b[:-1]
    shared_index = self.frame.index[:-1]
    full = self.frame['E'].reindex(shared_index)
    truncated = self.frame['F'].reindex(shared_index)
    tm.assert_sp_series_equal(full, truncated, check_names=False)
def test_setitem_chained_no_consolidate(self):
    """Chained setitem keeps the frame's two blocks separate."""
    # https://github.com/pandas-dev/pandas/pull/19268
    # issuecomment-361696418
    # chained setitem used to cause consolidation
    rows = [[np.nan, 1], [2, np.nan]]
    sdf = pd.SparseDataFrame(rows)
    with pd.option_context('mode.chained_assignment', None):
        sdf[0][1] = 2
    block_count = len(sdf._data.blocks)
    assert block_count == 2
def test_delitem(self):
    """Deleting columns removes them and leaves the others intact."""
    column_a = self.frame['A']
    column_c = self.frame['C']

    del self.frame['B']
    assert 'B' not in self.frame
    tm.assert_sp_series_equal(self.frame['A'], column_a)
    tm.assert_sp_series_equal(self.frame['C'], column_c)

    for label in ('D', 'A'):
        del self.frame[label]
        assert label not in self.frame
def test_set_columns(self):
    """Columns may be reassigned, but not with a wrong-length index."""
    self.frame.columns = self.frame.columns

    too_few = self.frame.columns[:-1]
    with pytest.raises(Exception):
        self.frame.columns = too_few
def test_set_index(self):
    """The index may be reassigned, but not with a wrong-length index."""
    self.frame.index = self.frame.index

    too_few = self.frame.index[:-1]
    with pytest.raises(Exception):
        self.frame.index = too_few
def test_append(self):
    """``append`` stitches row slices back together, with and without sort."""
    top = self.frame[:5]
    bottom = self.frame[5:]

    appended = top.append(bottom)
    tm.assert_sp_frame_equal(appended, self.frame, exact_indices=False)

    top = self.frame.iloc[:5, :3]
    bottom = self.frame.iloc[5:]
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        # Stacklevel is set for pd.concat, not append
        appended = top.append(bottom)
    tm.assert_sp_frame_equal(appended.iloc[:, :3], self.frame.iloc[:, :3],
                             exact_indices=False)

    top = top[['B', 'C', 'A']].head(2)
    bottom = bottom.head(2)

    expected_data = {
        "B": [0., 1, None, 3],
        "C": [0., 1, 5, 6],
        "A": [None, None, 2, 3],
        "D": [None, None, 5, None],
    }
    expected = pd.SparseDataFrame(expected_data,
                                  index=top.index | bottom.index,
                                  columns=['B', 'C', 'A', 'D'])

    with tm.assert_produces_warning(None):
        appended = top.append(bottom, sort=False)
    tm.assert_frame_equal(appended, expected)

    with tm.assert_produces_warning(None):
        appended = top.append(bottom, sort=True)
    tm.assert_sp_frame_equal(appended, expected[['A', 'B', 'C', 'D']])
def test_astype(self):
    """``astype(float64)`` converts every column of an int64 sparse frame."""

    def _assert_column_dtypes(sparse_frame, dtype):
        assert sparse_frame['A'].dtype == dtype
        assert sparse_frame['B'].dtype == dtype

    # default frame-level fill value
    sparse = pd.SparseDataFrame(
        {'A': SparseArray([1, 2, 3, 4], dtype=np.int64),
         'B': SparseArray([4, 5, 6, 7], dtype=np.int64)})
    _assert_column_dtypes(sparse, np.int64)

    res = sparse.astype(np.float64)
    exp = pd.SparseDataFrame(
        {'A': SparseArray([1., 2., 3., 4.], fill_value=0.),
         'B': SparseArray([4., 5., 6., 7.], fill_value=0.)},
        default_fill_value=np.nan)
    tm.assert_sp_frame_equal(res, exp)
    _assert_column_dtypes(res, np.float64)

    # explicit zero fill value
    sparse = pd.SparseDataFrame(
        {'A': SparseArray([0, 2, 0, 4], dtype=np.int64),
         'B': SparseArray([0, 5, 0, 7], dtype=np.int64)},
        default_fill_value=0)
    _assert_column_dtypes(sparse, np.int64)

    res = sparse.astype(np.float64)
    exp = pd.SparseDataFrame(
        {'A': SparseArray([0., 2., 0., 4.], fill_value=0.),
         'B': SparseArray([0., 5., 0., 7.], fill_value=0.)},
        default_fill_value=0.)
    tm.assert_sp_frame_equal(res, exp)
    _assert_column_dtypes(res, np.float64)
def test_astype_bool(self):
    """``astype(bool)`` converts an int64 sparse frame to boolean columns.

    The builtin ``bool`` is used throughout: ``np.bool`` was only an alias
    for it and is no longer available in current NumPy releases.
    """
    sparse = pd.SparseDataFrame({'A': SparseArray([0, 2, 0, 4],
                                                  fill_value=0,
                                                  dtype=np.int64),
                                 'B': SparseArray([0, 5, 0, 7],
                                                  fill_value=0,
                                                  dtype=np.int64)},
                                default_fill_value=0)
    assert sparse['A'].dtype == np.int64
    assert sparse['B'].dtype == np.int64

    res = sparse.astype(bool)
    exp = pd.SparseDataFrame({'A': SparseArray([False, True, False, True],
                                               dtype=bool,
                                               fill_value=False),
                              'B': SparseArray([False, True, False, True],
                                               dtype=bool,
                                               fill_value=False)},
                             default_fill_value=False)
    tm.assert_sp_frame_equal(res, exp)
    assert res['A'].dtype == bool
    assert res['B'].dtype == bool
def test_fillna(self):
    """``fillna(0)`` on frame, frame in place and column matches dense."""
    df = self.zframe.reindex(lrange(5))
    dense = self.zorig.reindex(lrange(5))

    # returning a new frame
    result = df.fillna(0)
    expected = dense.fillna(0)
    tm.assert_sp_frame_equal(result, expected.to_sparse(fill_value=0),
                             exact_indices=False)
    tm.assert_frame_equal(result.to_dense(), expected)

    # in place on a copy of the frame
    result = df.copy()
    result.fillna(0, inplace=True)
    expected = dense.fillna(0)
    tm.assert_sp_frame_equal(result, expected.to_sparse(fill_value=0),
                             exact_indices=False)
    tm.assert_frame_equal(result.to_dense(), expected)

    # in place on a single column (a ``df.copy()`` whose result was
    # discarded used to precede this and has been removed)
    result = df['A']
    result.fillna(0, inplace=True)
    expected = dense['A'].fillna(0)
    # this changes internal SparseArray repr
    # tm.assert_sp_series_equal(result, expected.to_sparse(fill_value=0))
    tm.assert_series_equal(result.to_dense(), expected)
def test_fillna_fill_value(self):
    """``fillna`` matches dense whatever the frame's default fill value."""
    dense = pd.DataFrame({'A': [1, 0, 0], 'B': [np.nan, np.nan, 4]})

    nan_filled = pd.SparseDataFrame(dense)
    tm.assert_frame_equal(nan_filled.fillna(-1).to_dense(),
                          dense.fillna(-1), check_dtype=False)

    zero_filled = pd.SparseDataFrame(dense, default_fill_value=0)
    tm.assert_frame_equal(zero_filled.fillna(-1).to_dense(),
                          dense.fillna(-1), check_dtype=False)
def test_sparse_frame_pad_backfill_limit(self):
    """``reindex`` with ``method`` and ``limit`` fills only ``limit`` rows."""
    index = np.arange(10)
    dense = DataFrame(np.random.randn(10, 4), index=index)
    sdf = dense.to_sparse()

    # forward fill from the first two rows
    head = sdf[:2]
    result = head.reindex(index, method='pad', limit=5)

    expected = sdf[:2].reindex(index).fillna(method='pad')
    expected = expected.to_dense()
    expected.values[-3:] = np.nan
    expected = expected.to_sparse()
    tm.assert_frame_equal(result, expected)

    # backward fill from the last two rows
    tail = sdf[-2:]
    result = tail.reindex(index, method='backfill', limit=5)

    expected = sdf[-2:].reindex(index).fillna(method='backfill')
    expected = expected.to_dense()
    expected.values[:3] = np.nan
    expected = expected.to_sparse()
    tm.assert_frame_equal(result, expected)
def test_sparse_frame_fillna_limit(self):
    """``fillna`` with ``method`` and ``limit`` fills only ``limit`` rows."""
    index = np.arange(10)
    dense = DataFrame(np.random.randn(10, 4), index=index)
    sdf = dense.to_sparse()

    # forward fill from the first two rows
    result = sdf[:2].reindex(index).fillna(method='pad', limit=5)

    expected = sdf[:2].reindex(index).fillna(method='pad')
    expected = expected.to_dense()
    expected.values[-3:] = np.nan
    expected = expected.to_sparse()
    tm.assert_frame_equal(result, expected)

    # backward fill from the last two rows
    result = sdf[-2:].reindex(index).fillna(method='backfill', limit=5)

    expected = sdf[-2:].reindex(index).fillna(method='backfill')
    expected = expected.to_dense()
    expected.values[:3] = np.nan
    expected = expected.to_sparse()
    tm.assert_frame_equal(result, expected)
def test_rename(self):
    """``rename`` maps index labels and column labels through a callable."""
    # index labels through ``str``
    renamed = self.frame.rename(index=str)
    string_dates = self.dates.strftime("%Y-%m-%d %H:%M:%S")
    expected = SparseDataFrame(self.data, index=string_dates)
    tm.assert_sp_frame_equal(renamed, expected)

    # column labels through a formatting function
    renamed = self.frame.rename(columns=lambda x: '%s%d' % (x, len(x)))
    suffixed = {'A1': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
                'B1': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
                'C1': np.arange(10, dtype=np.float64),
                'D1': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}
    expected = SparseDataFrame(suffixed, index=self.dates)
    tm.assert_sp_frame_equal(renamed, expected)
def test_corr(self):
    """``corr`` on the sparse frame equals ``corr`` on its dense form."""
    sparse_corr = self.frame.corr()
    dense_corr = self.frame.to_dense().corr()
    tm.assert_frame_equal(sparse_corr, dense_corr)
def test_describe(self):
    """Smoke test: summaries run on a frame that has an all-NaN column."""
    self.frame['foo'] = np.nan

    # none of these calls is asserted on; they only must not raise
    self.frame.get_dtype_counts()
    str(self.frame)
    desc = self.frame.describe()  # noqa
def test_join(self):
    """``join`` merges disjoint columns and rejects bad right-hand sides."""
    left = self.frame.loc[:, ['A', 'B']]
    right = self.frame.loc[:, ['C', 'D']]
    tm.assert_sp_frame_equal(left.join(right), self.frame,
                             exact_indices=False)

    # overlapping column 'B'
    overlapping = self.frame.loc[:, ['B', 'D']]
    with pytest.raises(Exception):
        left.join(overlapping)

    # a Series without a name cannot be joined
    with tm.assert_raises_regex(ValueError,
                                'Other Series must have a name'):
        unnamed = Series(np.random.randn(len(self.frame)),
                         index=self.frame.index)
        self.frame.join(unnamed)
def test_reindex(self):
    """``reindex`` on rows and columns matches dense and keeps fill values."""

    def _check_frame(frame):
        index = frame.index
        sidx = index[::2]
        sidx2 = index[:5]  # noqa

        sparse_result = frame.reindex(sidx)
        dense_result = frame.to_dense().reindex(sidx)
        tm.assert_frame_equal(sparse_result.to_dense(), dense_result)

        from_list = frame.reindex(list(sidx))
        tm.assert_frame_equal(from_list.to_dense(), dense_result)

        sparse_result2 = sparse_result.reindex(index)
        dense_result2 = dense_result.reindex(index)
        tm.assert_frame_equal(sparse_result2.to_dense(), dense_result2)

        # propagate CORRECT fill value
        tm.assert_almost_equal(sparse_result.default_fill_value,
                               frame.default_fill_value)
        tm.assert_almost_equal(sparse_result['A'].fill_value,
                               frame['A'].fill_value)

        # length zero
        length_zero = frame.reindex([])
        assert len(length_zero) == 0
        assert len(length_zero.columns) == len(frame.columns)
        assert len(length_zero['A']) == 0

        # frame being reindexed has length zero
        length_n = length_zero.reindex(index)
        assert len(length_n) == len(frame)
        assert len(length_n.columns) == len(frame.columns)
        assert len(length_n['A']) == len(frame)

        # reindex columns
        reindexed = frame.reindex(columns=['A', 'B', 'Z'])
        assert len(reindexed.columns) == 3
        tm.assert_almost_equal(reindexed['Z'].fill_value,
                               frame.default_fill_value)
        assert np.isnan(reindexed['Z'].sp_values).all()

    for fixture in (self.frame, self.iframe, self.zframe, self.fill_frame):
        _check_frame(fixture)

    # with copy=False
    same_index = self.frame.reindex(self.frame.index, copy=False)
    same_index['F'] = same_index['A']
    assert 'F' in self.frame

    copied = self.frame.reindex(self.frame.index)
    copied['G'] = copied['A']
    assert 'G' not in self.frame
def test_reindex_fill_value(self):
    """``reindex(fill_value=0)`` matches the sparsified dense reindex."""
    new_index = bdate_range('20110110', periods=20)

    result = self.zframe.reindex(new_index, fill_value=0)

    dense_expected = self.zorig.reindex(new_index, fill_value=0)
    expected = dense_expected.to_sparse(self.zframe.default_fill_value)
    tm.assert_sp_frame_equal(result, expected)
def test_reindex_method(self):
    """``reindex`` fill methods work over rows and are rejected over columns."""
    sdf = SparseDataFrame(data=[[11., 12., 14.],
                                [21., 22., 24.],
                                [41., 42., 44.]],
                          index=[1, 2, 4],
                          columns=[1, 2, 4],
                          dtype=float)

    def _row_expected(rows):
        # expected frame for a row reindex onto range(6)
        return SparseDataFrame(data=rows,
                               index=range(6),
                               columns=[1, 2, 4],
                               dtype=float)

    # Over indices

    # default method
    result = sdf.reindex(index=range(6))
    expected = _row_expected([[nan, nan, nan],
                              [11., 12., 14.],
                              [21., 22., 24.],
                              [nan, nan, nan],
                              [41., 42., 44.],
                              [nan, nan, nan]])
    tm.assert_sp_frame_equal(result, expected)

    # method='bfill'
    result = sdf.reindex(index=range(6), method='bfill')
    expected = _row_expected([[11., 12., 14.],
                              [11., 12., 14.],
                              [21., 22., 24.],
                              [41., 42., 44.],
                              [41., 42., 44.],
                              [nan, nan, nan]])
    tm.assert_sp_frame_equal(result, expected)

    # method='ffill'
    result = sdf.reindex(index=range(6), method='ffill')
    expected = _row_expected([[nan, nan, nan],
                              [11., 12., 14.],
                              [21., 22., 24.],
                              [21., 22., 24.],
                              [41., 42., 44.],
                              [41., 42., 44.]])
    tm.assert_sp_frame_equal(result, expected)

    # Over columns

    # default method
    result = sdf.reindex(columns=range(6))
    expected = SparseDataFrame(data=[[nan, 11., 12., nan, 14., nan],
                                     [nan, 21., 22., nan, 24., nan],
                                     [nan, 41., 42., nan, 44., nan]],
                               index=[1, 2, 4],
                               columns=range(6),
                               dtype=float)
    tm.assert_sp_frame_equal(result, expected)

    # method='bfill'
    with pytest.raises(NotImplementedError):
        sdf.reindex(columns=range(6), method='bfill')

    # method='ffill'
    with pytest.raises(NotImplementedError):
        sdf.reindex(columns=range(6), method='ffill')
def test_take(self):
    """``take`` along the columns matches a column reindex."""
    taken = self.frame.take([1, 0, 2], axis=1)
    reordered = self.frame.reindex(columns=['B', 'A', 'C'])
    tm.assert_sp_frame_equal(taken, reordered)
def test_to_dense(self):
    """``to_dense`` agrees with both the sparse frame and its dense source."""

    def _assert_dense_matches(sparse_frame, dense_source):
        densified = sparse_frame.to_dense()
        tm.assert_frame_equal(sparse_frame, densified)
        tm.assert_frame_equal(densified, dense_source, check_dtype=False)

    self._check_all(_assert_dense_matches)
def test_stack_sparse_frame(self):
    """``stack_sparse_frame`` matches the long frame built through a Panel."""
    # warnings raised inside the block are recorded rather than shown
    with catch_warnings(record=True):

        def _check(frame):
            dense_frame = frame.to_dense()  # noqa

            panel = Panel.from_dict({'foo': frame})
            via_panel = panel.to_frame()

            via_sparse = spf.stack_sparse_frame(frame)

            tm.assert_numpy_array_equal(via_panel.values,
                                        via_sparse.values)

        _check(self.frame)
        _check(self.iframe)

        # for now
        pytest.raises(Exception, _check, self.zframe)
        pytest.raises(Exception, _check, self.fill_frame)
def test_transpose(self):
    """Transposing twice gives back the frame; ``.T`` matches dense ``.T``."""

    def _assert_transpose(sparse_frame, dense_source):
        once = sparse_frame.T
        twice = once.T
        tm.assert_sp_frame_equal(sparse_frame, twice)

        tm.assert_frame_equal(sparse_frame.T.to_dense(), dense_source.T)
        tm.assert_frame_equal(sparse_frame.T.T.to_dense(),
                              dense_source.T.T)
        tm.assert_sp_frame_equal(sparse_frame, sparse_frame.T.T,
                                 exact_indices=False)

    self._check_all(_assert_transpose)
def test_shift(self):
    """``shift`` by periods and by frequency matches the dense result."""

    def _assert_shift(sparse_frame, dense_source):
        # zero shift is compared densely
        shifted = sparse_frame.shift(0)
        expected = dense_source.shift(0)
        tm.assert_frame_equal(shifted.to_dense(), expected)

        # positional shifts
        for periods in (1, -2):
            shifted = sparse_frame.shift(periods)
            expected = dense_source.shift(periods)
            tm.assert_frame_equal(shifted, expected)

        # frequency shifts, first by alias and then by offset object
        for freq in ('B', BDay()):
            shifted = sparse_frame.shift(2, freq=freq)
            expected = dense_source.shift(2, freq=freq)
            expected = expected.to_sparse(sparse_frame.default_fill_value,
                                          kind=sparse_frame.default_kind)
            tm.assert_frame_equal(shifted, expected)

    self._check_all(_assert_shift)
def test_count(self):
    """``count`` along either axis matches the dense frame."""
    per_column = self.frame.to_dense().count()

    tm.assert_series_equal(self.frame.count(), per_column)
    tm.assert_series_equal(self.frame.count(axis=None), per_column)
    tm.assert_series_equal(self.frame.count(axis=0), per_column)

    result = self.frame.count(axis=1)
    per_row = self.frame.to_dense().count(axis=1)
    # win32 don't check dtype
    tm.assert_series_equal(result, per_row, check_dtype=False)
def _check_all(self, check_func):
    """Call ``check_func(sparse, dense)`` for each fixture pair, in order."""
    fixture_pairs = (
        ('frame', 'orig'),
        ('iframe', 'iorig'),
        ('zframe', 'zorig'),
        ('fill_frame', 'fill_orig'),
    )
    for sparse_name, dense_name in fixture_pairs:
        check_func(getattr(self, sparse_name), getattr(self, dense_name))
def test_numpy_transpose(self):
    """``np.transpose`` round-trips and rejects the ``axes`` argument."""
    sdf = SparseDataFrame([1, 2, 3], index=[1, 2, 3], columns=['a'])

    round_tripped = np.transpose(np.transpose(sdf))
    tm.assert_sp_frame_equal(round_tripped, sdf)

    msg = "the 'axes' parameter is not supported"
    with tm.assert_raises_regex(ValueError, msg):
        np.transpose(sdf, axes=1)
def test_combine_first(self):
    """``combine_first`` agrees for sparse, dense and all-dense operands."""
    full = self.frame
    every_other = full[::2]

    with_sparse = every_other.combine_first(full)
    with_dense = full[::2].combine_first(full.to_dense())

    expected = full[::2].to_dense().combine_first(full.to_dense())
    expected = expected.to_sparse(fill_value=full.default_fill_value)

    tm.assert_sp_frame_equal(with_sparse, with_dense)
    tm.assert_sp_frame_equal(with_sparse, expected)
def test_combine_add(self):
    """Sparse ``add`` with ``fill_value`` equals the sparsified dense ``add``."""
    left = self.frame.to_dense()
    right = left.copy()
    right['C'][:3] = np.nan
    left['A'][:3] = 5.7

    result = left.to_sparse().add(right.to_sparse(), fill_value=0)
    expected = left.add(right, fill_value=0).to_sparse()
    tm.assert_sp_frame_equal(result, expected)
def test_isin(self):
    """Filtering with ``isin`` matches filtering with ``==``."""
    flags = DataFrame({'flag': [1., 0., 1.]}).to_sparse(fill_value=0.)
    by_equality = flags[flags.flag == 1.]
    by_isin = flags[flags.flag.isin([1.])]
    tm.assert_frame_equal(by_equality, by_isin)
def test_sparse_pow_issue(self):
    """``1 ** frame`` yields columns consistent with ``take`` (issue 2220)."""
    # 2220
    # built but not used further; kept so the construction is exercised
    df = SparseDataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]})

    # note : no error without nan
    df = SparseDataFrame({'A': [nan, 0, 1]})

    # note that 2 ** df works fine, also df ** 1
    powered = 1 ** df

    via_take = powered.take([0], 1)['A']
    direct = powered['A']

    assert len(direct.sp_values) == len(via_take.sp_values)
def test_as_blocks(self):
    """The deprecated ``blocks`` accessor warns and groups by dtype."""
    sdf = SparseDataFrame({'A': [1.1, 3.3], 'B': [nan, -3.9]},
                          dtype='float64')

    # deprecated 0.21.0
    with tm.assert_produces_warning(FutureWarning,
                                    check_stacklevel=False):
        by_dtype = sdf.blocks

    assert list(by_dtype.keys()) == ['float64']
    tm.assert_frame_equal(by_dtype['float64'], sdf)
@pytest.mark.xfail(reason='nan column names in _init_dict problematic '
                          '(GH 16894)')
def test_nan_columnname(self):
    """A NaN column label should survive ``to_sparse`` (expected to fail)."""
    # GH 8822
    dense = DataFrame(Series(1.0, index=[0]), columns=[nan])
    sparse = dense.to_sparse()
    first_label = sparse.columns[0]
    assert np.isnan(first_label)
def test_isna(self):
    """``isna`` returns a sparse boolean frame marking missing entries."""
    # GH 8276
    sdf = pd.SparseDataFrame({'A': [np.nan, np.nan, 1, 2, np.nan],
                              'B': [0, np.nan, np.nan, 2, np.nan]})

    result = sdf.isna()
    expected = pd.SparseDataFrame(
        {'A': [True, True, False, False, True],
         'B': [False, True, True, False, True]},
        default_fill_value=True)
    expected._default_fill_value = np.nan
    tm.assert_sp_frame_equal(result, expected)

    # if fill_value is not nan, True can be included in sp_values
    sdf = pd.SparseDataFrame({'A': [0, 0, 1, 2, np.nan],
                              'B': [0, np.nan, 0, 2, np.nan]},
                             default_fill_value=0.)
    result = sdf.isna()
    assert isinstance(result, pd.SparseDataFrame)
    expected_dense = pd.DataFrame(
        {'A': [False, False, False, False, True],
         'B': [False, True, False, False, True]})
    tm.assert_frame_equal(result.to_dense(), expected_dense)
def test_notna(self):
    # GH 8276
    nan_filled = pd.SparseDataFrame(
        {'A': [np.nan, np.nan, 1, 2, np.nan],
         'B': [0, np.nan, np.nan, 2, np.nan]})
    actual = nan_filled.notna()

    expected_sparse = pd.SparseDataFrame(
        {'A': [False, False, True, True, False],
         'B': [True, False, False, True, False]},
        default_fill_value=False)
    # Overwrite the private default fill value with NaN before comparing.
    expected_sparse._default_fill_value = np.nan
    tm.assert_sp_frame_equal(actual, expected_sparse)

    # if fill_value is not nan, True can be included in sp_values
    zero_filled = pd.SparseDataFrame(
        {'A': [0, 0, 1, 2, np.nan],
         'B': [0, np.nan, 0, 2, np.nan]},
        default_fill_value=0.)
    mask = zero_filled.notna()
    assert isinstance(mask, pd.SparseDataFrame)

    # Here the result is compared in dense form.
    expected_dense = pd.DataFrame(
        {'A': [True, True, True, True, False],
         'B': [True, False, True, True, False]})
    tm.assert_frame_equal(mask.to_dense(), expected_dense)
class TestSparseDataFrameArithmetic(object):
    # Scalar operators applied to a sparse frame are checked against the
    # same operator applied to the dense frame it was converted from.

    @staticmethod
    def _dense_frame():
        # Fresh dense frame for each test: 'A' starts with NaN, 'B' ends
        # with NaN, 'C' has no NaN and 'D' is entirely NaN.
        return pd.DataFrame({'A': [nan, nan, 0, 1],
                             'B': [0, 1, 2, nan],
                             'C': [1., 2., 3., 4.],
                             'D': [nan, nan, nan, nan]})

    def test_numeric_op_scalar(self):
        dense = self._dense_frame()
        sparse = dense.to_sparse()

        actual = sparse + 1
        expected = (dense + 1).to_sparse()
        tm.assert_sp_frame_equal(actual, expected)

    def test_comparison_op_scalar(self):
        # GH 13001
        dense = self._dense_frame()
        sparse = dense.to_sparse()

        # comparison changes internal repr, compare with dense
        greater = sparse > 1
        assert isinstance(greater, pd.SparseDataFrame)
        tm.assert_frame_equal(greater.to_dense(), dense > 1)

        nonzero = sparse != 0
        assert isinstance(nonzero, pd.SparseDataFrame)
        tm.assert_frame_equal(nonzero.to_dense(), dense != 0)
class TestSparseDataFrameAnalytics(object):
    # Reductions, cumulative operations and quantiles on a sparse frame,
    # compared with the dense equivalents.

    def setup_method(self, method):
        # Rebuilt for every test: columns A, B and D contain NaN runs at
        # different positions, C has none.
        self.data = {
            'A': [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6],
            'B': [0, 1, 2, nan, nan, nan, 3, 4, 5, 6],
            'C': np.arange(10, dtype=float),
            'D': [0, 1, 2, 3, 4, 5, nan, nan, nan, nan],
        }
        self.dates = bdate_range('1/1/2011', periods=10)
        self.frame = SparseDataFrame(self.data, index=self.dates)

    def test_cumsum(self):
        expected = SparseDataFrame(self.frame.to_dense().cumsum())

        # The default call, axis=None and axis=0 must all give the same
        # result.
        tm.assert_sp_frame_equal(self.frame.cumsum(), expected)
        for axis in (None, 0):
            result = self.frame.cumsum(axis=axis)
            tm.assert_sp_frame_equal(result, expected)

    def test_numpy_cumsum(self):
        result = np.cumsum(self.frame)
        expected = SparseDataFrame(self.frame.to_dense().cumsum())
        tm.assert_sp_frame_equal(result, expected)

        # The numpy-only keywords ``dtype`` and ``out`` are rejected with
        # a ValueError carrying the matching message.
        rejected = [
            ("the 'dtype' parameter is not supported", {'dtype': np.int64}),
            ("the 'out' parameter is not supported", {'out': result}),
        ]
        for msg, kwargs in rejected:
            with tm.assert_raises_regex(ValueError, msg):
                np.cumsum(self.frame, **kwargs)

    def test_numpy_func_call(self):
        # no exception should be raised even though
        # numpy passes in 'axis=None' or `axis=-1'
        func_names = ('sum', 'cumsum', 'var',
                      'mean', 'prod', 'cumprod',
                      'std', 'min', 'max')
        for name in func_names:
            numpy_func = getattr(np, name)
            numpy_func(self.frame)

    @pytest.mark.xfail(reason='Wrong SparseBlock initialization '
                              '(GH 17386)')
    def test_quantile(self):
        # GH 17386
        data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
        q = 0.1

        result = SparseDataFrame(data).quantile(q)

        dense_expected = DataFrame(data).quantile(q)
        sparse_expected = SparseSeries(dense_expected)

        # The result is checked against both the dense and the sparse
        # expectation.
        tm.assert_series_equal(result, dense_expected)
        tm.assert_sp_series_equal(result, sparse_expected)

    @pytest.mark.xfail(reason='Wrong SparseBlock initialization '
                              '(GH 17386)')
    def test_quantile_multi(self):
        # GH 17386
        data = [[1, 1], [2, 10], [3, 100], [nan, nan]]
        q = [0.1, 0.5]

        result = SparseDataFrame(data).quantile(q)

        dense_expected = DataFrame(data).quantile(q)
        sparse_expected = SparseDataFrame(dense_expected)

        # The result is checked against both the dense and the sparse
        # expectation.
        tm.assert_frame_equal(result, dense_expected)
        tm.assert_sp_frame_equal(result, sparse_expected)

    def test_assign_with_sparse_frame(self):
        # GH 19163
        dense = pd.DataFrame({"a": [1, 2, 3]})

        res = dense.to_sparse(fill_value=False).assign(newcol=False)
        exp = dense.assign(newcol=False).to_sparse(fill_value=False)
        tm.assert_sp_frame_equal(res, exp)

        # Every column of the result, the assigned one included, must be
        # exactly a SparseSeries (subclasses are not accepted).
        for label in res.columns:
            assert type(res[label]) is SparseSeries