847 lines
30 KiB
Python
847 lines
30 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
from __future__ import print_function
|
||
|
|
||
|
import pytest
|
||
|
|
||
|
from distutils.version import LooseVersion
|
||
|
from numpy import nan, random
|
||
|
import numpy as np
|
||
|
|
||
|
import datetime
|
||
|
import dateutil
|
||
|
|
||
|
from pandas.compat import lrange
|
||
|
from pandas import (DataFrame, Series, Timestamp,
|
||
|
date_range, Categorical)
|
||
|
import pandas as pd
|
||
|
|
||
|
from pandas.util.testing import assert_series_equal, assert_frame_equal
|
||
|
|
||
|
import pandas.util.testing as tm
|
||
|
import pandas.util._test_decorators as td
|
||
|
from pandas.tests.frame.common import TestData, _check_mixed_float
|
||
|
|
||
|
|
||
|
try:
|
||
|
import scipy
|
||
|
_is_scipy_ge_0190 = (LooseVersion(scipy.__version__) >=
|
||
|
LooseVersion('0.19.0'))
|
||
|
except:
|
||
|
_is_scipy_ge_0190 = False
|
||
|
|
||
|
|
||
|
def _skip_if_no_pchip():
|
||
|
try:
|
||
|
from scipy.interpolate import pchip_interpolate # noqa
|
||
|
except ImportError:
|
||
|
import pytest
|
||
|
pytest.skip('scipy.interpolate.pchip missing')
|
||
|
|
||
|
|
||
|
class TestDataFrameMissingData(TestData):
|
||
|
|
||
|
def test_dropEmptyRows(self):
|
||
|
N = len(self.frame.index)
|
||
|
mat = random.randn(N)
|
||
|
mat[:5] = nan
|
||
|
|
||
|
frame = DataFrame({'foo': mat}, index=self.frame.index)
|
||
|
original = Series(mat, index=self.frame.index, name='foo')
|
||
|
expected = original.dropna()
|
||
|
inplace_frame1, inplace_frame2 = frame.copy(), frame.copy()
|
||
|
|
||
|
smaller_frame = frame.dropna(how='all')
|
||
|
# check that original was preserved
|
||
|
assert_series_equal(frame['foo'], original)
|
||
|
inplace_frame1.dropna(how='all', inplace=True)
|
||
|
assert_series_equal(smaller_frame['foo'], expected)
|
||
|
assert_series_equal(inplace_frame1['foo'], expected)
|
||
|
|
||
|
smaller_frame = frame.dropna(how='all', subset=['foo'])
|
||
|
inplace_frame2.dropna(how='all', subset=['foo'], inplace=True)
|
||
|
assert_series_equal(smaller_frame['foo'], expected)
|
||
|
assert_series_equal(inplace_frame2['foo'], expected)
|
||
|
|
||
|
def test_dropIncompleteRows(self):
|
||
|
N = len(self.frame.index)
|
||
|
mat = random.randn(N)
|
||
|
mat[:5] = nan
|
||
|
|
||
|
frame = DataFrame({'foo': mat}, index=self.frame.index)
|
||
|
frame['bar'] = 5
|
||
|
original = Series(mat, index=self.frame.index, name='foo')
|
||
|
inp_frame1, inp_frame2 = frame.copy(), frame.copy()
|
||
|
|
||
|
smaller_frame = frame.dropna()
|
||
|
assert_series_equal(frame['foo'], original)
|
||
|
inp_frame1.dropna(inplace=True)
|
||
|
|
||
|
exp = Series(mat[5:], index=self.frame.index[5:], name='foo')
|
||
|
tm.assert_series_equal(smaller_frame['foo'], exp)
|
||
|
tm.assert_series_equal(inp_frame1['foo'], exp)
|
||
|
|
||
|
samesize_frame = frame.dropna(subset=['bar'])
|
||
|
assert_series_equal(frame['foo'], original)
|
||
|
assert (frame['bar'] == 5).all()
|
||
|
inp_frame2.dropna(subset=['bar'], inplace=True)
|
||
|
tm.assert_index_equal(samesize_frame.index, self.frame.index)
|
||
|
tm.assert_index_equal(inp_frame2.index, self.frame.index)
|
||
|
|
||
|
def test_dropna(self):
|
||
|
df = DataFrame(np.random.randn(6, 4))
|
||
|
df[2][:2] = nan
|
||
|
|
||
|
dropped = df.dropna(axis=1)
|
||
|
expected = df.loc[:, [0, 1, 3]]
|
||
|
inp = df.copy()
|
||
|
inp.dropna(axis=1, inplace=True)
|
||
|
assert_frame_equal(dropped, expected)
|
||
|
assert_frame_equal(inp, expected)
|
||
|
|
||
|
dropped = df.dropna(axis=0)
|
||
|
expected = df.loc[lrange(2, 6)]
|
||
|
inp = df.copy()
|
||
|
inp.dropna(axis=0, inplace=True)
|
||
|
assert_frame_equal(dropped, expected)
|
||
|
assert_frame_equal(inp, expected)
|
||
|
|
||
|
# threshold
|
||
|
dropped = df.dropna(axis=1, thresh=5)
|
||
|
expected = df.loc[:, [0, 1, 3]]
|
||
|
inp = df.copy()
|
||
|
inp.dropna(axis=1, thresh=5, inplace=True)
|
||
|
assert_frame_equal(dropped, expected)
|
||
|
assert_frame_equal(inp, expected)
|
||
|
|
||
|
dropped = df.dropna(axis=0, thresh=4)
|
||
|
expected = df.loc[lrange(2, 6)]
|
||
|
inp = df.copy()
|
||
|
inp.dropna(axis=0, thresh=4, inplace=True)
|
||
|
assert_frame_equal(dropped, expected)
|
||
|
assert_frame_equal(inp, expected)
|
||
|
|
||
|
dropped = df.dropna(axis=1, thresh=4)
|
||
|
assert_frame_equal(dropped, df)
|
||
|
|
||
|
dropped = df.dropna(axis=1, thresh=3)
|
||
|
assert_frame_equal(dropped, df)
|
||
|
|
||
|
# subset
|
||
|
dropped = df.dropna(axis=0, subset=[0, 1, 3])
|
||
|
inp = df.copy()
|
||
|
inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
|
||
|
assert_frame_equal(dropped, df)
|
||
|
assert_frame_equal(inp, df)
|
||
|
|
||
|
# all
|
||
|
dropped = df.dropna(axis=1, how='all')
|
||
|
assert_frame_equal(dropped, df)
|
||
|
|
||
|
df[2] = nan
|
||
|
dropped = df.dropna(axis=1, how='all')
|
||
|
expected = df.loc[:, [0, 1, 3]]
|
||
|
assert_frame_equal(dropped, expected)
|
||
|
|
||
|
# bad input
|
||
|
pytest.raises(ValueError, df.dropna, axis=3)
|
||
|
|
||
|
def test_drop_and_dropna_caching(self):
|
||
|
# tst that cacher updates
|
||
|
original = Series([1, 2, np.nan], name='A')
|
||
|
expected = Series([1, 2], dtype=original.dtype, name='A')
|
||
|
df = pd.DataFrame({'A': original.values.copy()})
|
||
|
df2 = df.copy()
|
||
|
df['A'].dropna()
|
||
|
assert_series_equal(df['A'], original)
|
||
|
df['A'].dropna(inplace=True)
|
||
|
assert_series_equal(df['A'], expected)
|
||
|
df2['A'].drop([1])
|
||
|
assert_series_equal(df2['A'], original)
|
||
|
df2['A'].drop([1], inplace=True)
|
||
|
assert_series_equal(df2['A'], original.drop([1]))
|
||
|
|
||
|
def test_dropna_corner(self):
|
||
|
# bad input
|
||
|
pytest.raises(ValueError, self.frame.dropna, how='foo')
|
||
|
pytest.raises(TypeError, self.frame.dropna, how=None)
|
||
|
# non-existent column - 8303
|
||
|
pytest.raises(KeyError, self.frame.dropna, subset=['A', 'X'])
|
||
|
|
||
|
def test_dropna_multiple_axes(self):
|
||
|
df = DataFrame([[1, np.nan, 2, 3],
|
||
|
[4, np.nan, 5, 6],
|
||
|
[np.nan, np.nan, np.nan, np.nan],
|
||
|
[7, np.nan, 8, 9]])
|
||
|
cp = df.copy()
|
||
|
|
||
|
# GH20987
|
||
|
with tm.assert_produces_warning(FutureWarning):
|
||
|
result = df.dropna(how='all', axis=[0, 1])
|
||
|
with tm.assert_produces_warning(FutureWarning):
|
||
|
result2 = df.dropna(how='all', axis=(0, 1))
|
||
|
expected = df.dropna(how='all').dropna(how='all', axis=1)
|
||
|
|
||
|
assert_frame_equal(result, expected)
|
||
|
assert_frame_equal(result2, expected)
|
||
|
assert_frame_equal(df, cp)
|
||
|
|
||
|
inp = df.copy()
|
||
|
with tm.assert_produces_warning(FutureWarning):
|
||
|
inp.dropna(how='all', axis=(0, 1), inplace=True)
|
||
|
assert_frame_equal(inp, expected)
|
||
|
|
||
|
def test_dropna_tz_aware_datetime(self):
|
||
|
# GH13407
|
||
|
df = DataFrame()
|
||
|
dt1 = datetime.datetime(2015, 1, 1,
|
||
|
tzinfo=dateutil.tz.tzutc())
|
||
|
dt2 = datetime.datetime(2015, 2, 2,
|
||
|
tzinfo=dateutil.tz.tzutc())
|
||
|
df['Time'] = [dt1]
|
||
|
result = df.dropna(axis=0)
|
||
|
expected = DataFrame({'Time': [dt1]})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# Ex2
|
||
|
df = DataFrame({'Time': [dt1, None, np.nan, dt2]})
|
||
|
result = df.dropna(axis=0)
|
||
|
expected = DataFrame([dt1, dt2],
|
||
|
columns=['Time'],
|
||
|
index=[0, 3])
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_fillna(self):
|
||
|
tf = self.tsframe
|
||
|
tf.loc[tf.index[:5], 'A'] = nan
|
||
|
tf.loc[tf.index[-5:], 'A'] = nan
|
||
|
|
||
|
zero_filled = self.tsframe.fillna(0)
|
||
|
assert (zero_filled.loc[zero_filled.index[:5], 'A'] == 0).all()
|
||
|
|
||
|
padded = self.tsframe.fillna(method='pad')
|
||
|
assert np.isnan(padded.loc[padded.index[:5], 'A']).all()
|
||
|
assert (padded.loc[padded.index[-5:], 'A'] ==
|
||
|
padded.loc[padded.index[-5], 'A']).all()
|
||
|
|
||
|
# mixed type
|
||
|
mf = self.mixed_frame
|
||
|
mf.loc[mf.index[5:20], 'foo'] = nan
|
||
|
mf.loc[mf.index[-10:], 'A'] = nan
|
||
|
result = self.mixed_frame.fillna(value=0)
|
||
|
result = self.mixed_frame.fillna(method='pad')
|
||
|
|
||
|
pytest.raises(ValueError, self.tsframe.fillna)
|
||
|
pytest.raises(ValueError, self.tsframe.fillna, 5, method='ffill')
|
||
|
|
||
|
# mixed numeric (but no float16)
|
||
|
mf = self.mixed_float.reindex(columns=['A', 'B', 'D'])
|
||
|
mf.loc[mf.index[-10:], 'A'] = nan
|
||
|
result = mf.fillna(value=0)
|
||
|
_check_mixed_float(result, dtype=dict(C=None))
|
||
|
|
||
|
result = mf.fillna(method='pad')
|
||
|
_check_mixed_float(result, dtype=dict(C=None))
|
||
|
|
||
|
# empty frame (GH #2778)
|
||
|
df = DataFrame(columns=['x'])
|
||
|
for m in ['pad', 'backfill']:
|
||
|
df.x.fillna(method=m, inplace=True)
|
||
|
df.x.fillna(method=m)
|
||
|
|
||
|
# with different dtype (GH3386)
|
||
|
df = DataFrame([['a', 'a', np.nan, 'a'], [
|
||
|
'b', 'b', np.nan, 'b'], ['c', 'c', np.nan, 'c']])
|
||
|
|
||
|
result = df.fillna({2: 'foo'})
|
||
|
expected = DataFrame([['a', 'a', 'foo', 'a'],
|
||
|
['b', 'b', 'foo', 'b'],
|
||
|
['c', 'c', 'foo', 'c']])
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
df.fillna({2: 'foo'}, inplace=True)
|
||
|
assert_frame_equal(df, expected)
|
||
|
|
||
|
# limit and value
|
||
|
df = DataFrame(np.random.randn(10, 3))
|
||
|
df.iloc[2:7, 0] = np.nan
|
||
|
df.iloc[3:5, 2] = np.nan
|
||
|
|
||
|
expected = df.copy()
|
||
|
expected.iloc[2, 0] = 999
|
||
|
expected.iloc[3, 2] = 999
|
||
|
result = df.fillna(999, limit=1)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# with datelike
|
||
|
# GH 6344
|
||
|
df = DataFrame({
|
||
|
'Date': [pd.NaT, Timestamp("2014-1-1")],
|
||
|
'Date2': [Timestamp("2013-1-1"), pd.NaT]
|
||
|
})
|
||
|
|
||
|
expected = df.copy()
|
||
|
expected['Date'] = expected['Date'].fillna(
|
||
|
df.loc[df.index[0], 'Date2'])
|
||
|
result = df.fillna(value={'Date': df['Date2']})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# with timezone
|
||
|
# GH 15855
|
||
|
df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
|
||
|
pd.NaT]})
|
||
|
exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
|
||
|
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
|
||
|
assert_frame_equal(df.fillna(method='pad'), exp)
|
||
|
|
||
|
df = pd.DataFrame({'A': [pd.NaT,
|
||
|
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
|
||
|
exp = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'),
|
||
|
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
|
||
|
assert_frame_equal(df.fillna(method='bfill'), exp)
|
||
|
|
||
|
def test_na_actions_categorical(self):
|
||
|
|
||
|
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
|
||
|
vals = ["a", "b", np.nan, "d"]
|
||
|
df = DataFrame({"cats": cat, "vals": vals})
|
||
|
cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
|
||
|
vals2 = ["a", "b", "b", "d"]
|
||
|
df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
|
||
|
cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
|
||
|
vals3 = ["a", "b", np.nan]
|
||
|
df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
|
||
|
cat4 = Categorical([1, 2], categories=[1, 2, 3])
|
||
|
vals4 = ["a", "b"]
|
||
|
df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
|
||
|
|
||
|
# fillna
|
||
|
res = df.fillna(value={"cats": 3, "vals": "b"})
|
||
|
tm.assert_frame_equal(res, df_exp_fill)
|
||
|
|
||
|
with tm.assert_raises_regex(ValueError, "fill value must be "
|
||
|
"in categories"):
|
||
|
df.fillna(value={"cats": 4, "vals": "c"})
|
||
|
|
||
|
res = df.fillna(method='pad')
|
||
|
tm.assert_frame_equal(res, df_exp_fill)
|
||
|
|
||
|
# dropna
|
||
|
res = df.dropna(subset=["cats"])
|
||
|
tm.assert_frame_equal(res, df_exp_drop_cats)
|
||
|
|
||
|
res = df.dropna()
|
||
|
tm.assert_frame_equal(res, df_exp_drop_all)
|
||
|
|
||
|
# make sure that fillna takes missing values into account
|
||
|
c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
|
||
|
df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
|
||
|
|
||
|
cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
|
||
|
df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
|
||
|
|
||
|
res = df.fillna("a")
|
||
|
tm.assert_frame_equal(res, df_exp)
|
||
|
|
||
|
def test_fillna_categorical_nan(self):
|
||
|
# GH 14021
|
||
|
# np.nan should always be a valid filler
|
||
|
cat = Categorical([np.nan, 2, np.nan])
|
||
|
val = Categorical([np.nan, np.nan, np.nan])
|
||
|
df = DataFrame({"cats": cat, "vals": val})
|
||
|
res = df.fillna(df.median())
|
||
|
v_exp = [np.nan, np.nan, np.nan]
|
||
|
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
|
||
|
dtype='category')
|
||
|
tm.assert_frame_equal(res, df_exp)
|
||
|
|
||
|
result = df.cats.fillna(np.nan)
|
||
|
tm.assert_series_equal(result, df.cats)
|
||
|
result = df.vals.fillna(np.nan)
|
||
|
tm.assert_series_equal(result, df.vals)
|
||
|
|
||
|
idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
|
||
|
'2011-01-01 09:00', pd.NaT, pd.NaT])
|
||
|
df = DataFrame({'a': Categorical(idx)})
|
||
|
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
|
||
|
|
||
|
idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
|
||
|
pd.NaT, pd.NaT], freq='M')
|
||
|
df = DataFrame({'a': Categorical(idx)})
|
||
|
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
|
||
|
|
||
|
idx = pd.TimedeltaIndex(['1 days', '2 days',
|
||
|
'1 days', pd.NaT, pd.NaT])
|
||
|
df = DataFrame({'a': Categorical(idx)})
|
||
|
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
|
||
|
|
||
|
def test_fillna_downcast(self):
|
||
|
# GH 15277
|
||
|
# infer int64 from float64
|
||
|
df = pd.DataFrame({'a': [1., np.nan]})
|
||
|
result = df.fillna(0, downcast='infer')
|
||
|
expected = pd.DataFrame({'a': [1, 0]})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# infer int64 from float64 when fillna value is a dict
|
||
|
df = pd.DataFrame({'a': [1., np.nan]})
|
||
|
result = df.fillna({'a': 0}, downcast='infer')
|
||
|
expected = pd.DataFrame({'a': [1, 0]})
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_fillna_dtype_conversion(self):
|
||
|
# make sure that fillna on an empty frame works
|
||
|
df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
|
||
|
result = df.get_dtype_counts().sort_values()
|
||
|
expected = Series({'object': 5})
|
||
|
assert_series_equal(result, expected)
|
||
|
|
||
|
result = df.fillna(1)
|
||
|
expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
|
||
|
result = result.get_dtype_counts().sort_values()
|
||
|
expected = Series({'int64': 5})
|
||
|
assert_series_equal(result, expected)
|
||
|
|
||
|
# empty block
|
||
|
df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
|
||
|
result = df.fillna('nan')
|
||
|
expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# equiv of replace
|
||
|
df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
|
||
|
for v in ['', 1, np.nan, 1.0]:
|
||
|
expected = df.replace(np.nan, v)
|
||
|
result = df.fillna(v)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_fillna_datetime_columns(self):
|
||
|
# GH 7095
|
||
|
df = pd.DataFrame({'A': [-1, -2, np.nan],
|
||
|
'B': date_range('20130101', periods=3),
|
||
|
'C': ['foo', 'bar', None],
|
||
|
'D': ['foo2', 'bar2', None]},
|
||
|
index=date_range('20130110', periods=3))
|
||
|
result = df.fillna('?')
|
||
|
expected = pd.DataFrame({'A': [-1, -2, '?'],
|
||
|
'B': date_range('20130101', periods=3),
|
||
|
'C': ['foo', 'bar', '?'],
|
||
|
'D': ['foo2', 'bar2', '?']},
|
||
|
index=date_range('20130110', periods=3))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
df = pd.DataFrame({'A': [-1, -2, np.nan],
|
||
|
'B': [pd.Timestamp('2013-01-01'),
|
||
|
pd.Timestamp('2013-01-02'), pd.NaT],
|
||
|
'C': ['foo', 'bar', None],
|
||
|
'D': ['foo2', 'bar2', None]},
|
||
|
index=date_range('20130110', periods=3))
|
||
|
result = df.fillna('?')
|
||
|
expected = pd.DataFrame({'A': [-1, -2, '?'],
|
||
|
'B': [pd.Timestamp('2013-01-01'),
|
||
|
pd.Timestamp('2013-01-02'), '?'],
|
||
|
'C': ['foo', 'bar', '?'],
|
||
|
'D': ['foo2', 'bar2', '?']},
|
||
|
index=pd.date_range('20130110', periods=3))
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_ffill(self):
|
||
|
self.tsframe['A'][:5] = nan
|
||
|
self.tsframe['A'][-5:] = nan
|
||
|
|
||
|
assert_frame_equal(self.tsframe.ffill(),
|
||
|
self.tsframe.fillna(method='ffill'))
|
||
|
|
||
|
def test_bfill(self):
|
||
|
self.tsframe['A'][:5] = nan
|
||
|
self.tsframe['A'][-5:] = nan
|
||
|
|
||
|
assert_frame_equal(self.tsframe.bfill(),
|
||
|
self.tsframe.fillna(method='bfill'))
|
||
|
|
||
|
def test_frame_pad_backfill_limit(self):
|
||
|
index = np.arange(10)
|
||
|
df = DataFrame(np.random.randn(10, 4), index=index)
|
||
|
|
||
|
result = df[:2].reindex(index, method='pad', limit=5)
|
||
|
|
||
|
expected = df[:2].reindex(index).fillna(method='pad')
|
||
|
expected.values[-3:] = np.nan
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df[-2:].reindex(index, method='backfill', limit=5)
|
||
|
|
||
|
expected = df[-2:].reindex(index).fillna(method='backfill')
|
||
|
expected.values[:3] = np.nan
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_frame_fillna_limit(self):
|
||
|
index = np.arange(10)
|
||
|
df = DataFrame(np.random.randn(10, 4), index=index)
|
||
|
|
||
|
result = df[:2].reindex(index)
|
||
|
result = result.fillna(method='pad', limit=5)
|
||
|
|
||
|
expected = df[:2].reindex(index).fillna(method='pad')
|
||
|
expected.values[-3:] = np.nan
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df[-2:].reindex(index)
|
||
|
result = result.fillna(method='backfill', limit=5)
|
||
|
|
||
|
expected = df[-2:].reindex(index).fillna(method='backfill')
|
||
|
expected.values[:3] = np.nan
|
||
|
tm.assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_fillna_skip_certain_blocks(self):
|
||
|
# don't try to fill boolean, int blocks
|
||
|
|
||
|
df = DataFrame(np.random.randn(10, 4).astype(int))
|
||
|
|
||
|
# it works!
|
||
|
df.fillna(np.nan)
|
||
|
|
||
|
def test_fillna_inplace(self):
|
||
|
df = DataFrame(np.random.randn(10, 4))
|
||
|
df[1][:4] = np.nan
|
||
|
df[3][-4:] = np.nan
|
||
|
|
||
|
expected = df.fillna(value=0)
|
||
|
assert expected is not df
|
||
|
|
||
|
df.fillna(value=0, inplace=True)
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
expected = df.fillna(value={0: 0}, inplace=True)
|
||
|
assert expected is None
|
||
|
|
||
|
df[1][:4] = np.nan
|
||
|
df[3][-4:] = np.nan
|
||
|
expected = df.fillna(method='ffill')
|
||
|
assert expected is not df
|
||
|
|
||
|
df.fillna(method='ffill', inplace=True)
|
||
|
tm.assert_frame_equal(df, expected)
|
||
|
|
||
|
def test_fillna_dict_series(self):
|
||
|
df = DataFrame({'a': [nan, 1, 2, nan, nan],
|
||
|
'b': [1, 2, 3, nan, nan],
|
||
|
'c': [nan, 1, 2, 3, 4]})
|
||
|
|
||
|
result = df.fillna({'a': 0, 'b': 5})
|
||
|
|
||
|
expected = df.copy()
|
||
|
expected['a'] = expected['a'].fillna(0)
|
||
|
expected['b'] = expected['b'].fillna(5)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# it works
|
||
|
result = df.fillna({'a': 0, 'b': 5, 'd': 7})
|
||
|
|
||
|
# Series treated same as dict
|
||
|
result = df.fillna(df.max())
|
||
|
expected = df.fillna(df.max().to_dict())
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# disable this for now
|
||
|
with tm.assert_raises_regex(NotImplementedError,
|
||
|
'column by column'):
|
||
|
df.fillna(df.max(1), axis=1)
|
||
|
|
||
|
def test_fillna_dataframe(self):
|
||
|
# GH 8377
|
||
|
df = DataFrame({'a': [nan, 1, 2, nan, nan],
|
||
|
'b': [1, 2, 3, nan, nan],
|
||
|
'c': [nan, 1, 2, 3, 4]},
|
||
|
index=list('VWXYZ'))
|
||
|
|
||
|
# df2 may have different index and columns
|
||
|
df2 = DataFrame({'a': [nan, 10, 20, 30, 40],
|
||
|
'b': [50, 60, 70, 80, 90],
|
||
|
'foo': ['bar'] * 5},
|
||
|
index=list('VWXuZ'))
|
||
|
|
||
|
result = df.fillna(df2)
|
||
|
|
||
|
# only those columns and indices which are shared get filled
|
||
|
expected = DataFrame({'a': [nan, 1, 2, nan, 40],
|
||
|
'b': [1, 2, 3, nan, 90],
|
||
|
'c': [nan, 1, 2, 3, 4]},
|
||
|
index=list('VWXYZ'))
|
||
|
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_fillna_columns(self):
|
||
|
df = DataFrame(np.random.randn(10, 10))
|
||
|
df.values[:, ::2] = np.nan
|
||
|
|
||
|
result = df.fillna(method='ffill', axis=1)
|
||
|
expected = df.T.fillna(method='pad').T
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
df.insert(6, 'foo', 5)
|
||
|
result = df.fillna(method='ffill', axis=1)
|
||
|
expected = df.astype(float).fillna(method='ffill', axis=1)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_fillna_invalid_method(self):
|
||
|
with tm.assert_raises_regex(ValueError, 'ffil'):
|
||
|
self.frame.fillna(method='ffil')
|
||
|
|
||
|
def test_fillna_invalid_value(self):
|
||
|
# list
|
||
|
pytest.raises(TypeError, self.frame.fillna, [1, 2])
|
||
|
# tuple
|
||
|
pytest.raises(TypeError, self.frame.fillna, (1, 2))
|
||
|
# frame with series
|
||
|
pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame)
|
||
|
|
||
|
def test_fillna_col_reordering(self):
|
||
|
cols = ["COL." + str(i) for i in range(5, 0, -1)]
|
||
|
data = np.random.rand(20, 5)
|
||
|
df = DataFrame(index=lrange(20), columns=cols, data=data)
|
||
|
filled = df.fillna(method='ffill')
|
||
|
assert df.columns.tolist() == filled.columns.tolist()
|
||
|
|
||
|
def test_fill_corner(self):
|
||
|
mf = self.mixed_frame
|
||
|
mf.loc[mf.index[5:20], 'foo'] = nan
|
||
|
mf.loc[mf.index[-10:], 'A'] = nan
|
||
|
|
||
|
filled = self.mixed_frame.fillna(value=0)
|
||
|
assert (filled.loc[filled.index[5:20], 'foo'] == 0).all()
|
||
|
del self.mixed_frame['foo']
|
||
|
|
||
|
empty_float = self.frame.reindex(columns=[])
|
||
|
|
||
|
# TODO(wesm): unused?
|
||
|
result = empty_float.fillna(value=0) # noqa
|
||
|
|
||
|
def test_fill_value_when_combine_const(self):
|
||
|
# GH12723
|
||
|
dat = np.array([0, 1, np.nan, 3, 4, 5], dtype='float')
|
||
|
df = DataFrame({'foo': dat}, index=range(6))
|
||
|
|
||
|
exp = df.fillna(0).add(2)
|
||
|
res = df.add(2, fill_value=0)
|
||
|
assert_frame_equal(res, exp)
|
||
|
|
||
|
|
||
|
class TestDataFrameInterpolate(TestData):
|
||
|
|
||
|
def test_interp_basic(self):
|
||
|
df = DataFrame({'A': [1, 2, np.nan, 4],
|
||
|
'B': [1, 4, 9, np.nan],
|
||
|
'C': [1, 2, 3, 5],
|
||
|
'D': list('abcd')})
|
||
|
expected = DataFrame({'A': [1., 2., 3., 4.],
|
||
|
'B': [1., 4., 9., 9.],
|
||
|
'C': [1, 2, 3, 5],
|
||
|
'D': list('abcd')})
|
||
|
result = df.interpolate()
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.set_index('C').interpolate()
|
||
|
expected = df.set_index('C')
|
||
|
expected.loc[3, 'A'] = 3
|
||
|
expected.loc[5, 'B'] = 9
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_interp_bad_method(self):
|
||
|
df = DataFrame({'A': [1, 2, np.nan, 4],
|
||
|
'B': [1, 4, 9, np.nan],
|
||
|
'C': [1, 2, 3, 5],
|
||
|
'D': list('abcd')})
|
||
|
with pytest.raises(ValueError):
|
||
|
df.interpolate(method='not_a_method')
|
||
|
|
||
|
def test_interp_combo(self):
|
||
|
df = DataFrame({'A': [1., 2., np.nan, 4.],
|
||
|
'B': [1, 4, 9, np.nan],
|
||
|
'C': [1, 2, 3, 5],
|
||
|
'D': list('abcd')})
|
||
|
|
||
|
result = df['A'].interpolate()
|
||
|
expected = Series([1., 2., 3., 4.], name='A')
|
||
|
assert_series_equal(result, expected)
|
||
|
|
||
|
result = df['A'].interpolate(downcast='infer')
|
||
|
expected = Series([1, 2, 3, 4], name='A')
|
||
|
assert_series_equal(result, expected)
|
||
|
|
||
|
def test_interp_nan_idx(self):
|
||
|
df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
|
||
|
df = df.set_index('A')
|
||
|
with pytest.raises(NotImplementedError):
|
||
|
df.interpolate(method='values')
|
||
|
|
||
|
@td.skip_if_no_scipy
|
||
|
def test_interp_various(self):
|
||
|
df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
|
||
|
'C': [1, 2, 3, 5, 8, 13, 21]})
|
||
|
df = df.set_index('C')
|
||
|
expected = df.copy()
|
||
|
result = df.interpolate(method='polynomial', order=1)
|
||
|
|
||
|
expected.A.loc[3] = 2.66666667
|
||
|
expected.A.loc[13] = 5.76923076
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.interpolate(method='cubic')
|
||
|
# GH #15662.
|
||
|
# new cubic and quadratic interpolation algorithms from scipy 0.19.0.
|
||
|
# previously `splmake` was used. See scipy/scipy#6710
|
||
|
if _is_scipy_ge_0190:
|
||
|
expected.A.loc[3] = 2.81547781
|
||
|
expected.A.loc[13] = 5.52964175
|
||
|
else:
|
||
|
expected.A.loc[3] = 2.81621174
|
||
|
expected.A.loc[13] = 5.64146581
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.interpolate(method='nearest')
|
||
|
expected.A.loc[3] = 2
|
||
|
expected.A.loc[13] = 5
|
||
|
assert_frame_equal(result, expected, check_dtype=False)
|
||
|
|
||
|
result = df.interpolate(method='quadratic')
|
||
|
if _is_scipy_ge_0190:
|
||
|
expected.A.loc[3] = 2.82150771
|
||
|
expected.A.loc[13] = 6.12648668
|
||
|
else:
|
||
|
expected.A.loc[3] = 2.82533638
|
||
|
expected.A.loc[13] = 6.02817974
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.interpolate(method='slinear')
|
||
|
expected.A.loc[3] = 2.66666667
|
||
|
expected.A.loc[13] = 5.76923077
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.interpolate(method='zero')
|
||
|
expected.A.loc[3] = 2.
|
||
|
expected.A.loc[13] = 5
|
||
|
assert_frame_equal(result, expected, check_dtype=False)
|
||
|
|
||
|
@td.skip_if_no_scipy
|
||
|
def test_interp_alt_scipy(self):
|
||
|
df = DataFrame({'A': [1, 2, np.nan, 4, 5, np.nan, 7],
|
||
|
'C': [1, 2, 3, 5, 8, 13, 21]})
|
||
|
result = df.interpolate(method='barycentric')
|
||
|
expected = df.copy()
|
||
|
expected.loc[2, 'A'] = 3
|
||
|
expected.loc[5, 'A'] = 6
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.interpolate(method='barycentric', downcast='infer')
|
||
|
assert_frame_equal(result, expected.astype(np.int64))
|
||
|
|
||
|
result = df.interpolate(method='krogh')
|
||
|
expectedk = df.copy()
|
||
|
expectedk['A'] = expected['A']
|
||
|
assert_frame_equal(result, expectedk)
|
||
|
|
||
|
_skip_if_no_pchip()
|
||
|
import scipy
|
||
|
result = df.interpolate(method='pchip')
|
||
|
expected.loc[2, 'A'] = 3
|
||
|
|
||
|
if LooseVersion(scipy.__version__) >= LooseVersion('0.17.0'):
|
||
|
expected.loc[5, 'A'] = 6.0
|
||
|
else:
|
||
|
expected.loc[5, 'A'] = 6.125
|
||
|
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_interp_rowwise(self):
|
||
|
df = DataFrame({0: [1, 2, np.nan, 4],
|
||
|
1: [2, 3, 4, np.nan],
|
||
|
2: [np.nan, 4, 5, 6],
|
||
|
3: [4, np.nan, 6, 7],
|
||
|
4: [1, 2, 3, 4]})
|
||
|
result = df.interpolate(axis=1)
|
||
|
expected = df.copy()
|
||
|
expected.loc[3, 1] = 5
|
||
|
expected.loc[0, 2] = 3
|
||
|
expected.loc[1, 3] = 3
|
||
|
expected[4] = expected[4].astype(np.float64)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.interpolate(axis=1, method='values')
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.interpolate(axis=0)
|
||
|
expected = df.interpolate()
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_rowwise_alt(self):
|
||
|
df = DataFrame({0: [0, .5, 1., np.nan, 4, 8, np.nan, np.nan, 64],
|
||
|
1: [1, 2, 3, 4, 3, 2, 1, 0, -1]})
|
||
|
df.interpolate(axis=0)
|
||
|
|
||
|
@pytest.mark.parametrize("check_scipy", [
|
||
|
False, pytest.param(True, marks=td.skip_if_no_scipy)
|
||
|
])
|
||
|
def test_interp_leading_nans(self, check_scipy):
|
||
|
df = DataFrame({"A": [np.nan, np.nan, .5, .25, 0],
|
||
|
"B": [np.nan, -3, -3.5, np.nan, -4]})
|
||
|
result = df.interpolate()
|
||
|
expected = df.copy()
|
||
|
expected['B'].loc[3] = -3.75
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
if check_scipy:
|
||
|
result = df.interpolate(method='polynomial', order=1)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_interp_raise_on_only_mixed(self):
|
||
|
df = DataFrame({'A': [1, 2, np.nan, 4],
|
||
|
'B': ['a', 'b', 'c', 'd'],
|
||
|
'C': [np.nan, 2, 5, 7],
|
||
|
'D': [np.nan, np.nan, 9, 9],
|
||
|
'E': [1, 2, 3, 4]})
|
||
|
with pytest.raises(TypeError):
|
||
|
df.interpolate(axis=1)
|
||
|
|
||
|
def test_interp_inplace(self):
|
||
|
df = DataFrame({'a': [1., 2., np.nan, 4.]})
|
||
|
expected = DataFrame({'a': [1., 2., 3., 4.]})
|
||
|
result = df.copy()
|
||
|
result['a'].interpolate(inplace=True)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
result = df.copy()
|
||
|
result['a'].interpolate(inplace=True, downcast='infer')
|
||
|
assert_frame_equal(result, expected.astype('int64'))
|
||
|
|
||
|
def test_interp_inplace_row(self):
|
||
|
# GH 10395
|
||
|
result = DataFrame({'a': [1., 2., 3., 4.],
|
||
|
'b': [np.nan, 2., 3., 4.],
|
||
|
'c': [3, 2, 2, 2]})
|
||
|
expected = result.interpolate(method='linear', axis=1, inplace=False)
|
||
|
result.interpolate(method='linear', axis=1, inplace=True)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
def test_interp_ignore_all_good(self):
|
||
|
# GH
|
||
|
df = DataFrame({'A': [1, 2, np.nan, 4],
|
||
|
'B': [1, 2, 3, 4],
|
||
|
'C': [1., 2., np.nan, 4.],
|
||
|
'D': [1., 2., 3., 4.]})
|
||
|
expected = DataFrame({'A': np.array(
|
||
|
[1, 2, 3, 4], dtype='float64'),
|
||
|
'B': np.array(
|
||
|
[1, 2, 3, 4], dtype='int64'),
|
||
|
'C': np.array(
|
||
|
[1., 2., 3, 4.], dtype='float64'),
|
||
|
'D': np.array(
|
||
|
[1., 2., 3., 4.], dtype='float64')})
|
||
|
|
||
|
result = df.interpolate(downcast=None)
|
||
|
assert_frame_equal(result, expected)
|
||
|
|
||
|
# all good
|
||
|
result = df[['B', 'D']].interpolate(downcast=None)
|
||
|
assert_frame_equal(result, df[['B', 'D']])
|