516 lines
17 KiB
Python
516 lines
17 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
from __future__ import print_function
|
||
|
|
||
|
import pytest
|
||
|
|
||
|
# pylint: disable-msg=W0612,E1101
|
||
|
from copy import deepcopy
|
||
|
import pydoc
|
||
|
|
||
|
from pandas.compat import range, lrange, long
|
||
|
from pandas import compat
|
||
|
|
||
|
from numpy.random import randn
|
||
|
import numpy as np
|
||
|
|
||
|
from pandas import (DataFrame, Series, date_range, timedelta_range,
|
||
|
Categorical, SparseDataFrame)
|
||
|
import pandas as pd
|
||
|
|
||
|
from pandas.util.testing import (assert_almost_equal,
|
||
|
assert_series_equal,
|
||
|
assert_frame_equal)
|
||
|
|
||
|
import pandas.util.testing as tm
|
||
|
|
||
|
from pandas.tests.frame.common import TestData
|
||
|
|
||
|
|
||
|
class SharedWithSparse(object):
    """
    A collection of tests DataFrame and SparseDataFrame can share.

    In generic tests on this class, use ``self._assert_frame_equal()`` and
    ``self._assert_series_equal()`` which are implemented in sub-classes
    and dispatch correctly.

    The tests read ``self.klass`` (the frame class under test) as well as
    the ``self.frame``, ``self.mixed_frame`` and ``self.empty`` fixtures;
    none of these are defined here, so the concrete test class has to
    provide them.
    """

    def _assert_frame_equal(self, left, right):
        """Dispatch to frame class dependent assertion"""
        raise NotImplementedError

    def _assert_series_equal(self, left, right):
        """Dispatch to series class dependent assertion"""
        raise NotImplementedError

    def test_copy_index_name_checking(self):
        """Renaming an axis of a copy must not rename the original axis."""
        # don't want to be able to modify the index stored elsewhere after
        # making a copy
        for attr in ('index', 'columns'):
            ind = getattr(self.frame, attr)
            ind.name = None
            cp = self.frame.copy()
            getattr(cp, attr).name = 'foo'
            assert getattr(self.frame, attr).name is None

    def test_getitem_pop_assign_name(self):
        """Columns obtained via ``[]``, ``pop`` and ``loc`` keep their name."""
        s = self.frame['A']
        assert s.name == 'A'

        s = self.frame.pop('A')
        assert s.name == 'A'

        s = self.frame.loc[:, 'B']
        assert s.name == 'B'

        s2 = s.loc[:]
        assert s2.name == 'B'

    def test_get_value(self):
        """``get_value`` warns and agrees with ``frame[col][idx]``."""
        for idx in self.frame.index:
            for col in self.frame.columns:
                with tm.assert_produces_warning(FutureWarning,
                                                check_stacklevel=False):
                    result = self.frame.get_value(idx, col)
                expected = self.frame[col][idx]
                tm.assert_almost_equal(result, expected)

    def test_add_prefix_suffix(self):
        """``add_prefix``/``add_suffix`` relabel columns, including '%'."""
        with_prefix = self.frame.add_prefix('foo#')
        expected = pd.Index(['foo#%s' % c for c in self.frame.columns])
        tm.assert_index_equal(with_prefix.columns, expected)

        with_suffix = self.frame.add_suffix('#foo')
        expected = pd.Index(['%s#foo' % c for c in self.frame.columns])
        tm.assert_index_equal(with_suffix.columns, expected)

        # a literal '%' must not be treated as a format directive
        with_pct_prefix = self.frame.add_prefix('%')
        expected = pd.Index(['%{}'.format(c) for c in self.frame.columns])
        tm.assert_index_equal(with_pct_prefix.columns, expected)

        with_pct_suffix = self.frame.add_suffix('%')
        expected = pd.Index(['{}%'.format(c) for c in self.frame.columns])
        tm.assert_index_equal(with_pct_suffix.columns, expected)

    def test_get_axis(self):
        """Axis numbers, names and aliases resolve; unknown axes raise."""
        f = self.frame
        assert f._get_axis_number(0) == 0
        assert f._get_axis_number(1) == 1
        assert f._get_axis_number('index') == 0
        assert f._get_axis_number('rows') == 0
        assert f._get_axis_number('columns') == 1

        assert f._get_axis_name(0) == 'index'
        assert f._get_axis_name(1) == 'columns'
        assert f._get_axis_name('index') == 'index'
        assert f._get_axis_name('rows') == 'index'
        assert f._get_axis_name('columns') == 'columns'

        assert f._get_axis(0) is f.index
        assert f._get_axis(1) is f.columns

        tm.assert_raises_regex(
            ValueError, 'No axis named', f._get_axis_number, 2)
        tm.assert_raises_regex(
            ValueError, 'No axis.*foo', f._get_axis_name, 'foo')
        tm.assert_raises_regex(
            ValueError, 'No axis.*None', f._get_axis_name, None)
        tm.assert_raises_regex(ValueError, 'No axis named',
                               f._get_axis_number, None)

    def test_keys(self):
        """``keys()`` hands back the very same object as ``columns``."""
        getkeys = self.frame.keys
        assert getkeys() is self.frame.columns

    def test_column_contains_typeerror(self):
        """Testing an Index for membership in the frame raises TypeError."""
        # The previous ``try/except TypeError: pass`` form passed whether or
        # not the error was raised, so it could never fail.
        with pytest.raises(TypeError):
            self.frame.columns in self.frame

    def test_tab_completion(self):
        """Identifier-like column labels show up in ``dir(df)``."""
        # DataFrame whose columns are identifiers shall have them in __dir__.
        df = pd.DataFrame([list('abcd'), list('efgh')], columns=list('ABCD'))
        for key in list('ABCD'):
            assert key in dir(df)
        assert isinstance(df.__getitem__('A'), pd.Series)

        # DataFrame whose first-level columns are identifiers shall have
        # them in __dir__.
        df = pd.DataFrame(
            [list('abcd'), list('efgh')],
            columns=pd.MultiIndex.from_tuples(list(zip('ABCD', 'EFGH'))))
        for key in list('ABCD'):
            assert key in dir(df)
        for key in list('EFGH'):
            assert key not in dir(df)
        assert isinstance(df.__getitem__('A'), pd.DataFrame)

    def test_not_hashable(self):
        """Frames, empty or not, cannot be hashed."""
        df = self.klass([1])
        pytest.raises(TypeError, hash, df)
        pytest.raises(TypeError, hash, self.empty)

    def test_new_empty_index(self):
        """Naming one empty frame's index leaves another frame's unnamed."""
        df1 = self.klass(randn(0, 3))
        df2 = self.klass(randn(0, 3))
        df1.index.name = 'foo'
        assert df2.index.name is None

    def test_array_interface(self):
        """A NumPy ufunc on a frame returns a frame sharing both axes."""
        with np.errstate(all='ignore'):
            result = np.sqrt(self.frame)
        assert isinstance(result, type(self.frame))
        assert result.index is self.frame.index
        assert result.columns is self.frame.columns

        self._assert_frame_equal(result, self.frame.apply(np.sqrt))

    def test_get_agg_axis(self):
        """``_get_agg_axis`` returns the opposite axis; axis 2 raises."""
        cols = self.frame._get_agg_axis(0)
        assert cols is self.frame.columns

        idx = self.frame._get_agg_axis(1)
        assert idx is self.frame.index

        pytest.raises(ValueError, self.frame._get_agg_axis, 2)

    def test_nonzero(self):
        """``empty`` is True only for the empty fixture."""
        assert self.empty.empty

        assert not self.frame.empty
        assert not self.mixed_frame.empty

        # corner case
        df = DataFrame({'A': [1., 2., 3.],
                        'B': ['a', 'b', 'c']},
                       index=np.arange(3))
        del df['A']
        assert not df.empty

    def test_iteritems(self):
        """Iterating items yields sliced objects even with duplicate labels."""
        df = self.klass([[1, 2, 3], [4, 5, 6]], columns=['a', 'a', 'b'])
        for k, v in compat.iteritems(df):
            assert isinstance(v, self.klass._constructor_sliced)

    def test_items(self):
        """``items()`` yields (label, Series) pairs in column order."""
        # issue #17213, #13918
        cols = ['a', 'b', 'c']
        df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)
        for c, (k, v) in zip(cols, df.items()):
            assert c == k
            assert isinstance(v, Series)
            assert (df[k] == v).all()

    def test_iter(self):
        """Iterating a frame yields its column labels."""
        assert tm.equalContents(list(self.frame), self.frame.columns)

    def test_iterrows(self):
        """``iterrows`` yields the same rows as label-based lookup."""
        for k, v in self.frame.iterrows():
            exp = self.frame.loc[k]
            self._assert_series_equal(v, exp)

        for k, v in self.mixed_frame.iterrows():
            exp = self.mixed_frame.loc[k]
            self._assert_series_equal(v, exp)

    def test_iterrows_iso8601(self):
        """``iterrows`` matches ``loc`` with a datetime and a string column."""
        # GH19671
        if self.klass == SparseDataFrame:
            pytest.xfail(reason='SparseBlock datetime type not implemented.')

        s = self.klass(
            {'non_iso8601': ['M1701', 'M1802', 'M1903', 'M2004'],
             'iso8601': date_range('2000-01-01', periods=4, freq='M')})
        for k, v in s.iterrows():
            exp = s.loc[k]
            self._assert_series_equal(v, exp)

    def test_itertuples(self):
        """Exercise ``itertuples``: values, naming and field fallbacks."""
        for i, tup in enumerate(self.frame.itertuples()):
            # tup[0] is the index label, the rest are the row values
            s = self.klass._constructor_sliced(tup[1:])
            s.name = tup[0]
            expected = self.frame.iloc[i, :].reset_index(drop=True)
            self._assert_series_equal(s, expected)

        df = self.klass({'floats': np.random.randn(5),
                         'ints': lrange(5)}, columns=['floats', 'ints'])

        for tup in df.itertuples(index=False):
            assert isinstance(tup[1], (int, long))

        df = self.klass(data={"a": [1, 2, 3], "b": [4, 5, 6]})
        dfaa = df[['a', 'a']]

        assert (list(dfaa.itertuples()) ==
                [(0, 1, 1), (1, 2, 2), (2, 3, 3)])

        # repr will be int/long on 32-bit/windows
        if not (compat.is_platform_windows() or compat.is_platform_32bit()):
            assert (repr(list(df.itertuples(name=None))) ==
                    '[(0, 1, 4), (1, 2, 5), (2, 3, 6)]')

        tup = next(df.itertuples(name='TestName'))
        assert tup._fields == ('Index', 'a', 'b')
        assert (tup.Index, tup.a, tup.b) == tup
        assert type(tup).__name__ == 'TestName'

        # keywords are not valid field names and get positional replacements
        df.columns = ['def', 'return']
        tup2 = next(df.itertuples(name='TestName'))
        assert tup2 == (0, 1, 4)
        assert tup2._fields == ('Index', '_1', '_2')

        df3 = DataFrame({'f' + str(i): [i] for i in range(1024)})
        # will raise SyntaxError if trying to create namedtuple
        tup3 = next(df3.itertuples())
        assert not hasattr(tup3, '_fields')
        assert isinstance(tup3, tuple)

    def test_sequence_like_with_categorical(self):
        """A frame holding a categorical column can be iterated every way."""

        # GH 7839
        # make sure can iterate
        df = DataFrame({"id": [1, 2, 3, 4, 5, 6],
                        "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
        df['grade'] = Categorical(df['raw_grade'])

        # basic sequencing testing
        result = list(df.grade.values)
        expected = np.array(df.grade.values).tolist()
        tm.assert_almost_equal(result, expected)

        # iteration
        for t in df.itertuples(index=False):
            str(t)

        for row, s in df.iterrows():
            str(s)

        for c, col in df.iteritems():
            # stringify the column being visited; this used to re-use the
            # stale ``s`` left over from the ``iterrows`` loop above
            str(col)

    def test_len(self):
        """``len(frame)`` equals the length of its index."""
        assert len(self.frame) == len(self.frame.index)

    def test_values(self):
        """``values`` agrees element-wise with column/row lookup."""
        frame = self.frame
        arr = frame.values

        frame_cols = frame.columns
        for i, row in enumerate(arr):
            for j, value in enumerate(row):
                col = frame_cols[j]
                # NaN != NaN, so missing values need their own comparison
                if np.isnan(value):
                    assert np.isnan(frame[col][i])
                else:
                    assert value == frame[col][i]

        # mixed type
        arr = self.mixed_frame[['foo', 'A']].values
        assert arr[0, 0] == 'bar'

        df = self.klass({'real': [1, 2, 3], 'complex': [1j, 2j, 3j]})
        arr = df.values
        assert arr[0, 0] == 1j

        # single block corner case
        arr = self.frame[['A', 'B']].values
        expected = self.frame.reindex(columns=['A', 'B']).values
        assert_almost_equal(arr, expected)

    def test_transpose(self):
        """``.T`` swaps labels; a mixed frame transposes to object dtype."""
        frame = self.frame
        dft = frame.T
        for idx, series in compat.iteritems(dft):
            for col, value in compat.iteritems(series):
                # NaN != NaN, so missing values need their own comparison
                if np.isnan(value):
                    assert np.isnan(frame[col][idx])
                else:
                    assert value == frame[col][idx]

        # mixed type
        index, data = tm.getMixedTypeDict()
        mixed = self.klass(data, index=index)

        mixed_T = mixed.T
        for col, s in compat.iteritems(mixed_T):
            assert s.dtype == np.object_

    def test_swapaxes(self):
        """``swapaxes`` matches ``.T``, is a no-op for (0, 0), else raises."""
        df = self.klass(np.random.randn(10, 5))
        self._assert_frame_equal(df.T, df.swapaxes(0, 1))
        self._assert_frame_equal(df.T, df.swapaxes(1, 0))
        self._assert_frame_equal(df, df.swapaxes(0, 0))
        pytest.raises(ValueError, df.swapaxes, 2, 5)

    def test_axis_aliases(self):
        """String axis names give the same reductions as integer axes."""
        f = self.frame

        # reg name
        expected = f.sum(axis=0)
        result = f.sum(axis='index')
        assert_series_equal(result, expected)

        expected = f.sum(axis=1)
        result = f.sum(axis='columns')
        assert_series_equal(result, expected)

    def test_class_axis(self):
        """Class-level ``index``/``columns`` expose a non-empty docstring."""
        # https://github.com/pandas-dev/pandas/issues/18147
        # no exception and no empty docstring
        assert pydoc.getdoc(DataFrame.index)
        assert pydoc.getdoc(DataFrame.columns)

    def test_more_values(self):
        """``values`` of the mixed frame has one column per frame column."""
        values = self.mixed_frame.values
        assert values.shape[1] == len(self.mixed_frame.columns)

    def test_repr_with_mi_nat(self):
        """``repr`` copes with ``NaT`` inside a MultiIndex level."""
        df = self.klass({'X': [1, 2]},
                        index=[[pd.NaT, pd.Timestamp('20130101')], ['a', 'b']])
        res = repr(df)
        # index levels are left-aligned and padded to a common width
        exp = '              X\nNaT        a  1\n2013-01-01 b  2'
        assert res == exp

    def test_iteritems_names(self):
        """Each column yielded by item iteration is named after its label."""
        for k, v in compat.iteritems(self.mixed_frame):
            assert v.name == k

    def test_series_put_names(self):
        """Every entry of ``_series`` is named after its key."""
        series = self.mixed_frame._series
        for k, v in compat.iteritems(series):
            assert v.name == k

    def test_empty_nonzero(self):
        """``empty`` (and ``.T.empty``) for populated and empty frames."""
        df = self.klass([1, 2, 3])
        assert not df.empty
        df = self.klass(index=[1], columns=[1])
        assert not df.empty
        df = self.klass(index=['a', 'b'], columns=['c', 'd']).dropna()
        assert df.empty
        assert df.T.empty
        empty_frames = [self.klass(),
                        self.klass(index=[1]),
                        self.klass(columns=[1]),
                        self.klass({1: []})]
        for df in empty_frames:
            assert df.empty
            assert df.T.empty

    def test_with_datetimelikes(self):
        """Transposing datetime + timedelta columns gives object columns."""

        df = self.klass({'A': date_range('20130101', periods=10),
                         'B': timedelta_range('1 day', periods=10)})
        t = df.T

        result = t.get_dtype_counts()
        expected = Series({'object': 10})
        tm.assert_series_equal(result, expected)
|
||
|
|
||
|
|
||
|
class TestDataFrameMisc(SharedWithSparse, TestData):
    """Run the shared API tests against ``DataFrame`` plus frame-only ones."""

    klass = DataFrame
    # SharedWithSparse tests use generic, klass-agnostic assertion
    _assert_frame_equal = staticmethod(assert_frame_equal)
    _assert_series_equal = staticmethod(assert_series_equal)

    def test_values(self):
        """Assigning through ``.values`` is visible on a later ``.values``."""
        # NOTE(review): this has the same name as SharedWithSparse.test_values
        # and therefore replaces it on this class -- confirm that is intended.
        self.frame.values[:, 0] = 5.
        first_column = self.frame.values[:, 0]
        assert (first_column == 5).all()

    def test_as_matrix_deprecated(self):
        """``as_matrix`` warns and returns the same array as ``values``."""
        # GH18458
        with tm.assert_produces_warning(FutureWarning):
            result = self.frame.as_matrix(columns=self.frame.columns.tolist())
        tm.assert_numpy_array_equal(result, self.frame.values)

    def test_deepcopy(self):
        """Overwriting a column of a deep copy leaves the original alone."""
        duplicate = deepcopy(self.frame)
        overwritten = duplicate['A']
        overwritten[:] = 10
        for label, new_value in compat.iteritems(overwritten):
            assert self.frame['A'][label] != new_value

    def test_transpose_get_view(self):
        """Writing into the transpose's values shows up in the frame."""
        transposed = self.frame.T
        transposed.values[:, 5:10] = 5

        assert (self.frame.values[5:10] == 5).all()

    def test_inplace_return_self(self):
        """Every ``inplace=True`` operation listed below returns ``None``."""
        # re #1893

        data = DataFrame({'a': ['foo', 'bar', 'baz', 'qux'],
                          'b': [0, 0, 1, 1],
                          'c': [1, 2, 3, 4]})

        def _check_f(base, f):
            assert f(base) is None

        # -----DataFrame-----
        # each entry: (fresh object to mutate, in-place operation)
        frame_cases = [
            # set_index
            (data.copy(),
             lambda x: x.set_index('a', inplace=True)),
            # reset_index
            (data.set_index('a'),
             lambda x: x.reset_index(inplace=True)),
            # drop_duplicates
            (data.copy(),
             lambda x: x.drop_duplicates(inplace=True)),
            # sort
            (data.copy(),
             lambda x: x.sort_values('b', inplace=True)),
            # sort_index
            (data.copy(),
             lambda x: x.sort_index(inplace=True)),
            # fillna
            (data.copy(),
             lambda x: x.fillna(0, inplace=True)),
            # replace
            (data.copy(),
             lambda x: x.replace(1, 0, inplace=True)),
            # rename
            (data.copy(),
             lambda x: x.rename({1: 'foo'}, inplace=True)),
        ]
        for base, op in frame_cases:
            _check_f(base, op)

        # -----Series-----
        d = data.copy()['c']

        series_cases = [
            # reset_index
            (data.set_index('a')['c'],
             lambda x: x.reset_index(inplace=True, drop=True)),
            # fillna
            (d.copy(),
             lambda x: x.fillna(0, inplace=True)),
            # replace
            (d.copy(),
             lambda x: x.replace(1, 0, inplace=True)),
            # rename
            (d.copy(),
             lambda x: x.rename({1: 'foo'}, inplace=True)),
        ]
        for base, op in series_cases:
            _check_f(base, op)

    def test_tab_complete_warning(self, ip):
        """Requesting IPython completions on a frame emits no warning."""
        # https://github.com/pandas-dev/pandas/issues/16409
        pytest.importorskip('IPython', minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        ip.run_code("import pandas as pd; df = pd.DataFrame()")
        with tm.assert_produces_warning(None), provisionalcompleter('ignore'):
            list(ip.Completer.completions('df.', 1))
|