2474 lines
90 KiB
Python
2474 lines
90 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
from __future__ import print_function
|
|
|
|
import warnings
|
|
from datetime import timedelta
|
|
import operator
|
|
import pytest
|
|
|
|
from string import ascii_lowercase
|
|
from numpy import nan
|
|
from numpy.random import randn
|
|
import numpy as np
|
|
|
|
from pandas.compat import lrange, PY35
|
|
from pandas import (compat, isna, notna, DataFrame, Series,
|
|
MultiIndex, date_range, Timestamp, Categorical,
|
|
_np_version_under1p12)
|
|
import pandas as pd
|
|
import pandas.core.nanops as nanops
|
|
import pandas.core.algorithms as algorithms
|
|
import pandas.io.formats.printing as printing
|
|
|
|
import pandas.util.testing as tm
|
|
import pandas.util._test_decorators as td
|
|
from pandas.tests.frame.common import TestData
|
|
|
|
|
|
class TestDataFrameAnalytics(TestData):
|
|
|
|
# ---------------------------------------------------------------------=
|
|
# Correlation and covariance
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corr_pearson(self):
|
|
self.frame['A'][:5] = nan
|
|
self.frame['B'][5:10] = nan
|
|
|
|
self._check_method('pearson')
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corr_kendall(self):
|
|
self.frame['A'][:5] = nan
|
|
self.frame['B'][5:10] = nan
|
|
|
|
self._check_method('kendall')
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corr_spearman(self):
|
|
self.frame['A'][:5] = nan
|
|
self.frame['B'][5:10] = nan
|
|
|
|
self._check_method('spearman')
|
|
|
|
def _check_method(self, method='pearson', check_minp=False):
|
|
if not check_minp:
|
|
correls = self.frame.corr(method=method)
|
|
exp = self.frame['A'].corr(self.frame['C'], method=method)
|
|
tm.assert_almost_equal(correls['A']['C'], exp)
|
|
else:
|
|
result = self.frame.corr(min_periods=len(self.frame) - 8)
|
|
expected = self.frame.corr()
|
|
expected.loc['A', 'B'] = expected.loc['B', 'A'] = nan
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corr_non_numeric(self):
|
|
self.frame['A'][:5] = nan
|
|
self.frame['B'][5:10] = nan
|
|
|
|
# exclude non-numeric types
|
|
result = self.mixed_frame.corr()
|
|
expected = self.mixed_frame.loc[:, ['A', 'B', 'C', 'D']].corr()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corr_nooverlap(self):
|
|
# nothing in common
|
|
for meth in ['pearson', 'kendall', 'spearman']:
|
|
df = DataFrame({'A': [1, 1.5, 1, np.nan, np.nan, np.nan],
|
|
'B': [np.nan, np.nan, np.nan, 1, 1.5, 1],
|
|
'C': [np.nan, np.nan, np.nan, np.nan,
|
|
np.nan, np.nan]})
|
|
rs = df.corr(meth)
|
|
assert isna(rs.loc['A', 'B'])
|
|
assert isna(rs.loc['B', 'A'])
|
|
assert rs.loc['A', 'A'] == 1
|
|
assert rs.loc['B', 'B'] == 1
|
|
assert isna(rs.loc['C', 'C'])
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corr_constant(self):
|
|
# constant --> all NA
|
|
|
|
for meth in ['pearson', 'spearman']:
|
|
df = DataFrame({'A': [1, 1, 1, np.nan, np.nan, np.nan],
|
|
'B': [np.nan, np.nan, np.nan, 1, 1, 1]})
|
|
rs = df.corr(meth)
|
|
assert isna(rs.values).all()
|
|
|
|
def test_corr_int(self):
|
|
# dtypes other than float64 #1761
|
|
df3 = DataFrame({"a": [1, 2, 3, 4], "b": [1, 2, 3, 4]})
|
|
|
|
df3.cov()
|
|
df3.corr()
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_corr_int_and_boolean(self):
|
|
# when dtypes of pandas series are different
|
|
# then ndarray will have dtype=object,
|
|
# so it need to be properly handled
|
|
df = DataFrame({"a": [True, False], "b": [1, 0]})
|
|
|
|
expected = DataFrame(np.ones((2, 2)), index=[
|
|
'a', 'b'], columns=['a', 'b'])
|
|
for meth in ['pearson', 'kendall', 'spearman']:
|
|
|
|
# RuntimeWarning
|
|
with warnings.catch_warnings(record=True):
|
|
result = df.corr(meth)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_corr_cov_independent_index_column(self):
|
|
# GH 14617
|
|
df = pd.DataFrame(np.random.randn(4 * 10).reshape(10, 4),
|
|
columns=list("abcd"))
|
|
for method in ['cov', 'corr']:
|
|
result = getattr(df, method)()
|
|
assert result.index is not result.columns
|
|
assert result.index.equals(result.columns)
|
|
|
|
def test_cov(self):
|
|
# min_periods no NAs (corner case)
|
|
expected = self.frame.cov()
|
|
result = self.frame.cov(min_periods=len(self.frame))
|
|
|
|
tm.assert_frame_equal(expected, result)
|
|
|
|
result = self.frame.cov(min_periods=len(self.frame) + 1)
|
|
assert isna(result.values).all()
|
|
|
|
# with NAs
|
|
frame = self.frame.copy()
|
|
frame['A'][:5] = nan
|
|
frame['B'][5:10] = nan
|
|
result = self.frame.cov(min_periods=len(self.frame) - 8)
|
|
expected = self.frame.cov()
|
|
expected.loc['A', 'B'] = np.nan
|
|
expected.loc['B', 'A'] = np.nan
|
|
|
|
# regular
|
|
self.frame['A'][:5] = nan
|
|
self.frame['B'][:10] = nan
|
|
cov = self.frame.cov()
|
|
|
|
tm.assert_almost_equal(cov['A']['C'],
|
|
self.frame['A'].cov(self.frame['C']))
|
|
|
|
# exclude non-numeric types
|
|
result = self.mixed_frame.cov()
|
|
expected = self.mixed_frame.loc[:, ['A', 'B', 'C', 'D']].cov()
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Single column frame
|
|
df = DataFrame(np.linspace(0.0, 1.0, 10))
|
|
result = df.cov()
|
|
expected = DataFrame(np.cov(df.values.T).reshape((1, 1)),
|
|
index=df.columns, columns=df.columns)
|
|
tm.assert_frame_equal(result, expected)
|
|
df.loc[0] = np.nan
|
|
result = df.cov()
|
|
expected = DataFrame(np.cov(df.values[1:].T).reshape((1, 1)),
|
|
index=df.columns, columns=df.columns)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_corrwith(self):
|
|
a = self.tsframe
|
|
noise = Series(randn(len(a)), index=a.index)
|
|
|
|
b = self.tsframe.add(noise, axis=0)
|
|
|
|
# make sure order does not matter
|
|
b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:])
|
|
del b['B']
|
|
|
|
colcorr = a.corrwith(b, axis=0)
|
|
tm.assert_almost_equal(colcorr['A'], a['A'].corr(b['A']))
|
|
|
|
rowcorr = a.corrwith(b, axis=1)
|
|
tm.assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0))
|
|
|
|
dropped = a.corrwith(b, axis=0, drop=True)
|
|
tm.assert_almost_equal(dropped['A'], a['A'].corr(b['A']))
|
|
assert 'B' not in dropped
|
|
|
|
dropped = a.corrwith(b, axis=1, drop=True)
|
|
assert a.index[-1] not in dropped.index
|
|
|
|
# non time-series data
|
|
index = ['a', 'b', 'c', 'd', 'e']
|
|
columns = ['one', 'two', 'three', 'four']
|
|
df1 = DataFrame(randn(5, 4), index=index, columns=columns)
|
|
df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns)
|
|
correls = df1.corrwith(df2, axis=1)
|
|
for row in index[:4]:
|
|
tm.assert_almost_equal(correls[row],
|
|
df1.loc[row].corr(df2.loc[row]))
|
|
|
|
def test_corrwith_with_objects(self):
|
|
df1 = tm.makeTimeDataFrame()
|
|
df2 = tm.makeTimeDataFrame()
|
|
cols = ['A', 'B', 'C', 'D']
|
|
|
|
df1['obj'] = 'foo'
|
|
df2['obj'] = 'bar'
|
|
|
|
result = df1.corrwith(df2)
|
|
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df1.corrwith(df2, axis=1)
|
|
expected = df1.loc[:, cols].corrwith(df2.loc[:, cols], axis=1)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_corrwith_series(self):
|
|
result = self.tsframe.corrwith(self.tsframe['A'])
|
|
expected = self.tsframe.apply(self.tsframe['A'].corr)
|
|
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_corrwith_matches_corrcoef(self):
|
|
df1 = DataFrame(np.arange(10000), columns=['a'])
|
|
df2 = DataFrame(np.arange(10000) ** 2, columns=['a'])
|
|
c1 = df1.corrwith(df2)['a']
|
|
c2 = np.corrcoef(df1['a'], df2['a'])[0][1]
|
|
|
|
tm.assert_almost_equal(c1, c2)
|
|
assert c1 < 1
|
|
|
|
def test_corrwith_mixed_dtypes(self):
|
|
# GH 18570
|
|
df = pd.DataFrame({'a': [1, 4, 3, 2], 'b': [4, 6, 7, 3],
|
|
'c': ['a', 'b', 'c', 'd']})
|
|
s = pd.Series([0, 6, 7, 3])
|
|
result = df.corrwith(s)
|
|
corrs = [df['a'].corr(s), df['b'].corr(s)]
|
|
expected = pd.Series(data=corrs, index=['a', 'b'])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_bool_describe_in_mixed_frame(self):
|
|
df = DataFrame({
|
|
'string_data': ['a', 'b', 'c', 'd', 'e'],
|
|
'bool_data': [True, True, False, False, False],
|
|
'int_data': [10, 20, 30, 40, 50],
|
|
})
|
|
|
|
# Integer data are included in .describe() output,
|
|
# Boolean and string data are not.
|
|
result = df.describe()
|
|
expected = DataFrame({'int_data': [5, 30, df.int_data.std(),
|
|
10, 20, 30, 40, 50]},
|
|
index=['count', 'mean', 'std', 'min', '25%',
|
|
'50%', '75%', 'max'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Top value is a boolean value that is False
|
|
result = df.describe(include=['bool'])
|
|
|
|
expected = DataFrame({'bool_data': [5, 2, False, 3]},
|
|
index=['count', 'unique', 'top', 'freq'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_describe_bool_frame(self):
|
|
# GH 13891
|
|
df = pd.DataFrame({
|
|
'bool_data_1': [False, False, True, True],
|
|
'bool_data_2': [False, True, True, True]
|
|
})
|
|
result = df.describe()
|
|
expected = DataFrame({'bool_data_1': [4, 2, True, 2],
|
|
'bool_data_2': [4, 2, True, 3]},
|
|
index=['count', 'unique', 'top', 'freq'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df = pd.DataFrame({
|
|
'bool_data': [False, False, True, True, False],
|
|
'int_data': [0, 1, 2, 3, 4]
|
|
})
|
|
result = df.describe()
|
|
expected = DataFrame({'int_data': [5, 2, df.int_data.std(), 0, 1,
|
|
2, 3, 4]},
|
|
index=['count', 'mean', 'std', 'min', '25%',
|
|
'50%', '75%', 'max'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df = pd.DataFrame({
|
|
'bool_data': [False, False, True, True],
|
|
'str_data': ['a', 'b', 'c', 'a']
|
|
})
|
|
result = df.describe()
|
|
expected = DataFrame({'bool_data': [4, 2, True, 2],
|
|
'str_data': [4, 3, 'a', 2]},
|
|
index=['count', 'unique', 'top', 'freq'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_describe_categorical(self):
|
|
df = DataFrame({'value': np.random.randint(0, 10000, 100)})
|
|
labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
|
|
cat_labels = Categorical(labels, labels)
|
|
|
|
df = df.sort_values(by=['value'], ascending=True)
|
|
df['value_group'] = pd.cut(df.value, range(0, 10500, 500),
|
|
right=False, labels=cat_labels)
|
|
cat = df
|
|
|
|
# Categoricals should not show up together with numerical columns
|
|
result = cat.describe()
|
|
assert len(result.columns) == 1
|
|
|
|
# In a frame, describe() for the cat should be the same as for string
|
|
# arrays (count, unique, top, freq)
|
|
|
|
cat = Categorical(["a", "b", "b", "b"], categories=['a', 'b', 'c'],
|
|
ordered=True)
|
|
s = Series(cat)
|
|
result = s.describe()
|
|
expected = Series([4, 2, "b", 3],
|
|
index=['count', 'unique', 'top', 'freq'])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
cat = Series(Categorical(["a", "b", "c", "c"]))
|
|
df3 = DataFrame({"cat": cat, "s": ["a", "b", "c", "c"]})
|
|
res = df3.describe()
|
|
tm.assert_numpy_array_equal(res["cat"].values, res["s"].values)
|
|
|
|
def test_describe_categorical_columns(self):
|
|
# GH 11558
|
|
columns = pd.CategoricalIndex(['int1', 'int2', 'obj'],
|
|
ordered=True, name='XXX')
|
|
df = DataFrame({'int1': [10, 20, 30, 40, 50],
|
|
'int2': [10, 20, 30, 40, 50],
|
|
'obj': ['A', 0, None, 'X', 1]},
|
|
columns=columns)
|
|
result = df.describe()
|
|
|
|
exp_columns = pd.CategoricalIndex(['int1', 'int2'],
|
|
categories=['int1', 'int2', 'obj'],
|
|
ordered=True, name='XXX')
|
|
expected = DataFrame({'int1': [5, 30, df.int1.std(),
|
|
10, 20, 30, 40, 50],
|
|
'int2': [5, 30, df.int2.std(),
|
|
10, 20, 30, 40, 50]},
|
|
index=['count', 'mean', 'std', 'min', '25%',
|
|
'50%', '75%', 'max'],
|
|
columns=exp_columns)
|
|
tm.assert_frame_equal(result, expected)
|
|
tm.assert_categorical_equal(result.columns.values,
|
|
expected.columns.values)
|
|
|
|
def test_describe_datetime_columns(self):
|
|
columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01'],
|
|
freq='MS', tz='US/Eastern', name='XXX')
|
|
df = DataFrame({0: [10, 20, 30, 40, 50],
|
|
1: [10, 20, 30, 40, 50],
|
|
2: ['A', 0, None, 'X', 1]})
|
|
df.columns = columns
|
|
result = df.describe()
|
|
|
|
exp_columns = pd.DatetimeIndex(['2011-01-01', '2011-02-01'],
|
|
freq='MS', tz='US/Eastern', name='XXX')
|
|
expected = DataFrame({0: [5, 30, df.iloc[:, 0].std(),
|
|
10, 20, 30, 40, 50],
|
|
1: [5, 30, df.iloc[:, 1].std(),
|
|
10, 20, 30, 40, 50]},
|
|
index=['count', 'mean', 'std', 'min', '25%',
|
|
'50%', '75%', 'max'])
|
|
expected.columns = exp_columns
|
|
tm.assert_frame_equal(result, expected)
|
|
assert result.columns.freq == 'MS'
|
|
assert result.columns.tz == expected.columns.tz
|
|
|
|
def test_describe_timedelta_values(self):
|
|
# GH 6145
|
|
t1 = pd.timedelta_range('1 days', freq='D', periods=5)
|
|
t2 = pd.timedelta_range('1 hours', freq='H', periods=5)
|
|
df = pd.DataFrame({'t1': t1, 't2': t2})
|
|
|
|
expected = DataFrame({'t1': [5, pd.Timedelta('3 days'),
|
|
df.iloc[:, 0].std(),
|
|
pd.Timedelta('1 days'),
|
|
pd.Timedelta('2 days'),
|
|
pd.Timedelta('3 days'),
|
|
pd.Timedelta('4 days'),
|
|
pd.Timedelta('5 days')],
|
|
't2': [5, pd.Timedelta('3 hours'),
|
|
df.iloc[:, 1].std(),
|
|
pd.Timedelta('1 hours'),
|
|
pd.Timedelta('2 hours'),
|
|
pd.Timedelta('3 hours'),
|
|
pd.Timedelta('4 hours'),
|
|
pd.Timedelta('5 hours')]},
|
|
index=['count', 'mean', 'std', 'min', '25%',
|
|
'50%', '75%', 'max'])
|
|
|
|
res = df.describe()
|
|
tm.assert_frame_equal(res, expected)
|
|
|
|
exp_repr = (" t1 t2\n"
|
|
"count 5 5\n"
|
|
"mean 3 days 00:00:00 0 days 03:00:00\n"
|
|
"std 1 days 13:56:50.394919 0 days 01:34:52.099788\n"
|
|
"min 1 days 00:00:00 0 days 01:00:00\n"
|
|
"25% 2 days 00:00:00 0 days 02:00:00\n"
|
|
"50% 3 days 00:00:00 0 days 03:00:00\n"
|
|
"75% 4 days 00:00:00 0 days 04:00:00\n"
|
|
"max 5 days 00:00:00 0 days 05:00:00")
|
|
assert repr(res) == exp_repr
|
|
|
|
def test_reduce_mixed_frame(self):
|
|
# GH 6806
|
|
df = DataFrame({
|
|
'bool_data': [True, True, False, False, False],
|
|
'int_data': [10, 20, 30, 40, 50],
|
|
'string_data': ['a', 'b', 'c', 'd', 'e'],
|
|
})
|
|
df.reindex(columns=['bool_data', 'int_data', 'string_data'])
|
|
test = df.sum(axis=0)
|
|
tm.assert_numpy_array_equal(test.values,
|
|
np.array([2, 150, 'abcde'], dtype=object))
|
|
tm.assert_series_equal(test, df.T.sum(axis=1))
|
|
|
|
def test_count(self):
|
|
f = lambda s: notna(s).sum()
|
|
self._check_stat_op('count', f,
|
|
has_skipna=False,
|
|
has_numeric_only=True,
|
|
check_dtype=False,
|
|
check_dates=True)
|
|
|
|
# corner case
|
|
frame = DataFrame()
|
|
ct1 = frame.count(1)
|
|
assert isinstance(ct1, Series)
|
|
|
|
ct2 = frame.count(0)
|
|
assert isinstance(ct2, Series)
|
|
|
|
# GH #423
|
|
df = DataFrame(index=lrange(10))
|
|
result = df.count(1)
|
|
expected = Series(0, index=df.index)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
df = DataFrame(columns=lrange(10))
|
|
result = df.count(0)
|
|
expected = Series(0, index=df.columns)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
df = DataFrame()
|
|
result = df.count()
|
|
expected = Series(0, index=[])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_nunique(self):
|
|
f = lambda s: len(algorithms.unique1d(s.dropna()))
|
|
self._check_stat_op('nunique', f, has_skipna=False,
|
|
check_dtype=False, check_dates=True)
|
|
|
|
df = DataFrame({'A': [1, 1, 1],
|
|
'B': [1, 2, 3],
|
|
'C': [1, np.nan, 3]})
|
|
tm.assert_series_equal(df.nunique(), Series({'A': 1, 'B': 3, 'C': 2}))
|
|
tm.assert_series_equal(df.nunique(dropna=False),
|
|
Series({'A': 1, 'B': 3, 'C': 3}))
|
|
tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2}))
|
|
tm.assert_series_equal(df.nunique(axis=1, dropna=False),
|
|
Series({0: 1, 1: 3, 2: 2}))
|
|
|
|
def test_sum(self):
|
|
self._check_stat_op('sum', np.sum, has_numeric_only=True,
|
|
skipna_alternative=np.nansum)
|
|
|
|
# mixed types (with upcasting happening)
|
|
self._check_stat_op('sum', np.sum,
|
|
frame=self.mixed_float.astype('float32'),
|
|
has_numeric_only=True, check_dtype=False,
|
|
check_less_precise=True)
|
|
|
|
@pytest.mark.parametrize(
|
|
"method", ['sum', 'mean', 'prod', 'var',
|
|
'std', 'skew', 'min', 'max'])
|
|
def test_stat_operators_attempt_obj_array(self, method):
|
|
# GH #676
|
|
data = {
|
|
'a': [-0.00049987540199591344, -0.0016467257772919831,
|
|
0.00067695870775883013],
|
|
'b': [-0, -0, 0.0],
|
|
'c': [0.00031111847529610595, 0.0014902627951905339,
|
|
-0.00094099200035979691]
|
|
}
|
|
df1 = DataFrame(data, index=['foo', 'bar', 'baz'],
|
|
dtype='O')
|
|
|
|
df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3],
|
|
2: [np.nan, 4]}, dtype=object)
|
|
|
|
for df in [df1, df2]:
|
|
assert df.values.dtype == np.object_
|
|
result = getattr(df, method)(1)
|
|
expected = getattr(df.astype('f8'), method)(1)
|
|
|
|
if method in ['sum', 'prod']:
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_mean(self):
|
|
self._check_stat_op('mean', np.mean, check_dates=True)
|
|
|
|
def test_product(self):
|
|
self._check_stat_op('product', np.prod)
|
|
|
|
def test_median(self):
|
|
def wrapper(x):
|
|
if isna(x).any():
|
|
return np.nan
|
|
return np.median(x)
|
|
|
|
self._check_stat_op('median', wrapper, check_dates=True)
|
|
|
|
def test_min(self):
|
|
with warnings.catch_warnings(record=True):
|
|
self._check_stat_op('min', np.min, check_dates=True)
|
|
self._check_stat_op('min', np.min, frame=self.intframe)
|
|
|
|
def test_cummin(self):
|
|
self.tsframe.loc[5:10, 0] = nan
|
|
self.tsframe.loc[10:15, 1] = nan
|
|
self.tsframe.loc[15:, 2] = nan
|
|
|
|
# axis = 0
|
|
cummin = self.tsframe.cummin()
|
|
expected = self.tsframe.apply(Series.cummin)
|
|
tm.assert_frame_equal(cummin, expected)
|
|
|
|
# axis = 1
|
|
cummin = self.tsframe.cummin(axis=1)
|
|
expected = self.tsframe.apply(Series.cummin, axis=1)
|
|
tm.assert_frame_equal(cummin, expected)
|
|
|
|
# it works
|
|
df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
|
|
result = df.cummin() # noqa
|
|
|
|
# fix issue
|
|
cummin_xs = self.tsframe.cummin(axis=1)
|
|
assert np.shape(cummin_xs) == np.shape(self.tsframe)
|
|
|
|
def test_cummax(self):
|
|
self.tsframe.loc[5:10, 0] = nan
|
|
self.tsframe.loc[10:15, 1] = nan
|
|
self.tsframe.loc[15:, 2] = nan
|
|
|
|
# axis = 0
|
|
cummax = self.tsframe.cummax()
|
|
expected = self.tsframe.apply(Series.cummax)
|
|
tm.assert_frame_equal(cummax, expected)
|
|
|
|
# axis = 1
|
|
cummax = self.tsframe.cummax(axis=1)
|
|
expected = self.tsframe.apply(Series.cummax, axis=1)
|
|
tm.assert_frame_equal(cummax, expected)
|
|
|
|
# it works
|
|
df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
|
|
result = df.cummax() # noqa
|
|
|
|
# fix issue
|
|
cummax_xs = self.tsframe.cummax(axis=1)
|
|
assert np.shape(cummax_xs) == np.shape(self.tsframe)
|
|
|
|
def test_max(self):
|
|
with warnings.catch_warnings(record=True):
|
|
self._check_stat_op('max', np.max, check_dates=True)
|
|
self._check_stat_op('max', np.max, frame=self.intframe)
|
|
|
|
def test_mad(self):
|
|
f = lambda x: np.abs(x - x.mean()).mean()
|
|
self._check_stat_op('mad', f)
|
|
|
|
def test_var_std(self):
|
|
alt = lambda x: np.var(x, ddof=1)
|
|
self._check_stat_op('var', alt)
|
|
|
|
alt = lambda x: np.std(x, ddof=1)
|
|
self._check_stat_op('std', alt)
|
|
|
|
result = self.tsframe.std(ddof=4)
|
|
expected = self.tsframe.apply(lambda x: x.std(ddof=4))
|
|
tm.assert_almost_equal(result, expected)
|
|
|
|
result = self.tsframe.var(ddof=4)
|
|
expected = self.tsframe.apply(lambda x: x.var(ddof=4))
|
|
tm.assert_almost_equal(result, expected)
|
|
|
|
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
|
|
result = nanops.nanvar(arr, axis=0)
|
|
assert not (result < 0).any()
|
|
|
|
with pd.option_context('use_bottleneck', False):
|
|
result = nanops.nanvar(arr, axis=0)
|
|
assert not (result < 0).any()
|
|
|
|
@pytest.mark.parametrize(
|
|
"meth", ['sem', 'var', 'std'])
|
|
def test_numeric_only_flag(self, meth):
|
|
# GH #9201
|
|
df1 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz'])
|
|
# set one entry to a number in str format
|
|
df1.loc[0, 'foo'] = '100'
|
|
|
|
df2 = DataFrame(np.random.randn(5, 3), columns=['foo', 'bar', 'baz'])
|
|
# set one entry to a non-number str
|
|
df2.loc[0, 'foo'] = 'a'
|
|
|
|
result = getattr(df1, meth)(axis=1, numeric_only=True)
|
|
expected = getattr(df1[['bar', 'baz']], meth)(axis=1)
|
|
tm.assert_series_equal(expected, result)
|
|
|
|
result = getattr(df2, meth)(axis=1, numeric_only=True)
|
|
expected = getattr(df2[['bar', 'baz']], meth)(axis=1)
|
|
tm.assert_series_equal(expected, result)
|
|
|
|
# df1 has all numbers, df2 has a letter inside
|
|
pytest.raises(TypeError, lambda: getattr(df1, meth)(
|
|
axis=1, numeric_only=False))
|
|
pytest.raises(TypeError, lambda: getattr(df2, meth)(
|
|
axis=1, numeric_only=False))
|
|
|
|
def test_mixed_ops(self):
|
|
# GH 16116
|
|
df = DataFrame({'int': [1, 2, 3, 4],
|
|
'float': [1., 2., 3., 4.],
|
|
'str': ['a', 'b', 'c', 'd']})
|
|
|
|
for op in ['mean', 'std', 'var', 'skew',
|
|
'kurt', 'sem']:
|
|
result = getattr(df, op)()
|
|
assert len(result) == 2
|
|
|
|
with pd.option_context('use_bottleneck', False):
|
|
result = getattr(df, op)()
|
|
assert len(result) == 2
|
|
|
|
def test_cumsum(self):
|
|
self.tsframe.loc[5:10, 0] = nan
|
|
self.tsframe.loc[10:15, 1] = nan
|
|
self.tsframe.loc[15:, 2] = nan
|
|
|
|
# axis = 0
|
|
cumsum = self.tsframe.cumsum()
|
|
expected = self.tsframe.apply(Series.cumsum)
|
|
tm.assert_frame_equal(cumsum, expected)
|
|
|
|
# axis = 1
|
|
cumsum = self.tsframe.cumsum(axis=1)
|
|
expected = self.tsframe.apply(Series.cumsum, axis=1)
|
|
tm.assert_frame_equal(cumsum, expected)
|
|
|
|
# works
|
|
df = DataFrame({'A': np.arange(20)}, index=np.arange(20))
|
|
result = df.cumsum() # noqa
|
|
|
|
# fix issue
|
|
cumsum_xs = self.tsframe.cumsum(axis=1)
|
|
assert np.shape(cumsum_xs) == np.shape(self.tsframe)
|
|
|
|
def test_cumprod(self):
|
|
self.tsframe.loc[5:10, 0] = nan
|
|
self.tsframe.loc[10:15, 1] = nan
|
|
self.tsframe.loc[15:, 2] = nan
|
|
|
|
# axis = 0
|
|
cumprod = self.tsframe.cumprod()
|
|
expected = self.tsframe.apply(Series.cumprod)
|
|
tm.assert_frame_equal(cumprod, expected)
|
|
|
|
# axis = 1
|
|
cumprod = self.tsframe.cumprod(axis=1)
|
|
expected = self.tsframe.apply(Series.cumprod, axis=1)
|
|
tm.assert_frame_equal(cumprod, expected)
|
|
|
|
# fix issue
|
|
cumprod_xs = self.tsframe.cumprod(axis=1)
|
|
assert np.shape(cumprod_xs) == np.shape(self.tsframe)
|
|
|
|
# ints
|
|
df = self.tsframe.fillna(0).astype(int)
|
|
df.cumprod(0)
|
|
df.cumprod(1)
|
|
|
|
# ints32
|
|
df = self.tsframe.fillna(0).astype(np.int32)
|
|
df.cumprod(0)
|
|
df.cumprod(1)
|
|
|
|
def test_sem(self):
|
|
alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
|
|
self._check_stat_op('sem', alt)
|
|
|
|
result = self.tsframe.sem(ddof=4)
|
|
expected = self.tsframe.apply(
|
|
lambda x: x.std(ddof=4) / np.sqrt(len(x)))
|
|
tm.assert_almost_equal(result, expected)
|
|
|
|
arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
|
|
result = nanops.nansem(arr, axis=0)
|
|
assert not (result < 0).any()
|
|
|
|
with pd.option_context('use_bottleneck', False):
|
|
result = nanops.nansem(arr, axis=0)
|
|
assert not (result < 0).any()
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_skew(self):
|
|
from scipy.stats import skew
|
|
|
|
def alt(x):
|
|
if len(x) < 3:
|
|
return np.nan
|
|
return skew(x, bias=False)
|
|
|
|
self._check_stat_op('skew', alt)
|
|
|
|
@td.skip_if_no_scipy
|
|
def test_kurt(self):
|
|
from scipy.stats import kurtosis
|
|
|
|
def alt(x):
|
|
if len(x) < 4:
|
|
return np.nan
|
|
return kurtosis(x, bias=False)
|
|
|
|
self._check_stat_op('kurt', alt)
|
|
|
|
index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
|
|
labels=[[0, 0, 0, 0, 0, 0],
|
|
[0, 1, 2, 0, 1, 2],
|
|
[0, 1, 0, 1, 0, 1]])
|
|
df = DataFrame(np.random.randn(6, 3), index=index)
|
|
|
|
kurt = df.kurt()
|
|
kurt2 = df.kurt(level=0).xs('bar')
|
|
tm.assert_series_equal(kurt, kurt2, check_names=False)
|
|
assert kurt.name is None
|
|
assert kurt2.name == 'bar'
|
|
|
|
def _check_stat_op(self, name, alternative, frame=None, has_skipna=True,
|
|
has_numeric_only=False, check_dtype=True,
|
|
check_dates=False, check_less_precise=False,
|
|
skipna_alternative=None):
|
|
if frame is None:
|
|
frame = self.frame
|
|
# set some NAs
|
|
frame.loc[5:10] = np.nan
|
|
frame.loc[15:20, -2:] = np.nan
|
|
|
|
f = getattr(frame, name)
|
|
|
|
if check_dates:
|
|
df = DataFrame({'b': date_range('1/1/2001', periods=2)})
|
|
_f = getattr(df, name)
|
|
result = _f()
|
|
assert isinstance(result, Series)
|
|
|
|
df['a'] = lrange(len(df))
|
|
result = getattr(df, name)()
|
|
assert isinstance(result, Series)
|
|
assert len(result)
|
|
|
|
if has_skipna:
|
|
def wrapper(x):
|
|
return alternative(x.values)
|
|
|
|
skipna_wrapper = tm._make_skipna_wrapper(alternative,
|
|
skipna_alternative)
|
|
result0 = f(axis=0, skipna=False)
|
|
result1 = f(axis=1, skipna=False)
|
|
tm.assert_series_equal(result0, frame.apply(wrapper),
|
|
check_dtype=check_dtype,
|
|
check_less_precise=check_less_precise)
|
|
# HACK: win32
|
|
tm.assert_series_equal(result1, frame.apply(wrapper, axis=1),
|
|
check_dtype=False,
|
|
check_less_precise=check_less_precise)
|
|
else:
|
|
skipna_wrapper = alternative
|
|
wrapper = alternative
|
|
|
|
result0 = f(axis=0)
|
|
result1 = f(axis=1)
|
|
tm.assert_series_equal(result0, frame.apply(skipna_wrapper),
|
|
check_dtype=check_dtype,
|
|
check_less_precise=check_less_precise)
|
|
if name in ['sum', 'prod']:
|
|
exp = frame.apply(skipna_wrapper, axis=1)
|
|
tm.assert_series_equal(result1, exp, check_dtype=False,
|
|
check_less_precise=check_less_precise)
|
|
|
|
# check dtypes
|
|
if check_dtype:
|
|
lcd_dtype = frame.values.dtype
|
|
assert lcd_dtype == result0.dtype
|
|
assert lcd_dtype == result1.dtype
|
|
|
|
# result = f(axis=1)
|
|
# comp = frame.apply(alternative, axis=1).reindex(result.index)
|
|
# assert_series_equal(result, comp)
|
|
|
|
# bad axis
|
|
tm.assert_raises_regex(ValueError, 'No axis named 2', f, axis=2)
|
|
# make sure works on mixed-type frame
|
|
getattr(self.mixed_frame, name)(axis=0)
|
|
getattr(self.mixed_frame, name)(axis=1)
|
|
|
|
if has_numeric_only:
|
|
getattr(self.mixed_frame, name)(axis=0, numeric_only=True)
|
|
getattr(self.mixed_frame, name)(axis=1, numeric_only=True)
|
|
getattr(self.frame, name)(axis=0, numeric_only=False)
|
|
getattr(self.frame, name)(axis=1, numeric_only=False)
|
|
|
|
# all NA case
|
|
if has_skipna:
|
|
all_na = self.frame * np.NaN
|
|
r0 = getattr(all_na, name)(axis=0)
|
|
r1 = getattr(all_na, name)(axis=1)
|
|
if name in ['sum', 'prod']:
|
|
unit = int(name == 'prod')
|
|
expected = pd.Series(unit, index=r0.index, dtype=r0.dtype)
|
|
tm.assert_series_equal(r0, expected)
|
|
expected = pd.Series(unit, index=r1.index, dtype=r1.dtype)
|
|
tm.assert_series_equal(r1, expected)
|
|
|
|
def test_mode(self):
|
|
df = pd.DataFrame({"A": [12, 12, 11, 12, 19, 11],
|
|
"B": [10, 10, 10, np.nan, 3, 4],
|
|
"C": [8, 8, 8, 9, 9, 9],
|
|
"D": np.arange(6, dtype='int64'),
|
|
"E": [8, 8, 1, 1, 3, 3]})
|
|
tm.assert_frame_equal(df[["A"]].mode(),
|
|
pd.DataFrame({"A": [12]}))
|
|
expected = pd.Series([0, 1, 2, 3, 4, 5], dtype='int64', name='D').\
|
|
to_frame()
|
|
tm.assert_frame_equal(df[["D"]].mode(), expected)
|
|
expected = pd.Series([1, 3, 8], dtype='int64', name='E').to_frame()
|
|
tm.assert_frame_equal(df[["E"]].mode(), expected)
|
|
tm.assert_frame_equal(df[["A", "B"]].mode(),
|
|
pd.DataFrame({"A": [12], "B": [10.]}))
|
|
tm.assert_frame_equal(df.mode(),
|
|
pd.DataFrame({"A": [12, np.nan, np.nan, np.nan,
|
|
np.nan, np.nan],
|
|
"B": [10, np.nan, np.nan, np.nan,
|
|
np.nan, np.nan],
|
|
"C": [8, 9, np.nan, np.nan, np.nan,
|
|
np.nan],
|
|
"D": [0, 1, 2, 3, 4, 5],
|
|
"E": [1, 3, 8, np.nan, np.nan,
|
|
np.nan]}))
|
|
|
|
# outputs in sorted order
|
|
df["C"] = list(reversed(df["C"]))
|
|
printing.pprint_thing(df["C"])
|
|
printing.pprint_thing(df["C"].mode())
|
|
a, b = (df[["A", "B", "C"]].mode(),
|
|
pd.DataFrame({"A": [12, np.nan],
|
|
"B": [10, np.nan],
|
|
"C": [8, 9]}))
|
|
printing.pprint_thing(a)
|
|
printing.pprint_thing(b)
|
|
tm.assert_frame_equal(a, b)
|
|
# should work with heterogeneous types
|
|
df = pd.DataFrame({"A": np.arange(6, dtype='int64'),
|
|
"B": pd.date_range('2011', periods=6),
|
|
"C": list('abcdef')})
|
|
exp = pd.DataFrame({"A": pd.Series(np.arange(6, dtype='int64'),
|
|
dtype=df["A"].dtype),
|
|
"B": pd.Series(pd.date_range('2011', periods=6),
|
|
dtype=df["B"].dtype),
|
|
"C": pd.Series(list('abcdef'),
|
|
dtype=df["C"].dtype)})
|
|
tm.assert_frame_equal(df.mode(), exp)
|
|
|
|
def test_operators_timedelta64(self):
|
|
from datetime import timedelta
|
|
df = DataFrame(dict(A=date_range('2012-1-1', periods=3, freq='D'),
|
|
B=date_range('2012-1-2', periods=3, freq='D'),
|
|
C=Timestamp('20120101') -
|
|
timedelta(minutes=5, seconds=5)))
|
|
|
|
diffs = DataFrame(dict(A=df['A'] - df['C'],
|
|
B=df['A'] - df['B']))
|
|
|
|
# min
|
|
result = diffs.min()
|
|
assert result[0] == diffs.loc[0, 'A']
|
|
assert result[1] == diffs.loc[0, 'B']
|
|
|
|
result = diffs.min(axis=1)
|
|
assert (result == diffs.loc[0, 'B']).all()
|
|
|
|
# max
|
|
result = diffs.max()
|
|
assert result[0] == diffs.loc[2, 'A']
|
|
assert result[1] == diffs.loc[2, 'B']
|
|
|
|
result = diffs.max(axis=1)
|
|
assert (result == diffs['A']).all()
|
|
|
|
# abs
|
|
result = diffs.abs()
|
|
result2 = abs(diffs)
|
|
expected = DataFrame(dict(A=df['A'] - df['C'],
|
|
B=df['B'] - df['A']))
|
|
tm.assert_frame_equal(result, expected)
|
|
tm.assert_frame_equal(result2, expected)
|
|
|
|
# mixed frame
|
|
mixed = diffs.copy()
|
|
mixed['C'] = 'foo'
|
|
mixed['D'] = 1
|
|
mixed['E'] = 1.
|
|
mixed['F'] = Timestamp('20130101')
|
|
|
|
# results in an object array
|
|
from pandas.core.tools.timedeltas import (
|
|
_coerce_scalar_to_timedelta_type as _coerce)
|
|
|
|
result = mixed.min()
|
|
expected = Series([_coerce(timedelta(seconds=5 * 60 + 5)),
|
|
_coerce(timedelta(days=-1)),
|
|
'foo', 1, 1.0,
|
|
Timestamp('20130101')],
|
|
index=mixed.columns)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# excludes numeric
|
|
result = mixed.min(axis=1)
|
|
expected = Series([1, 1, 1.], index=[0, 1, 2])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# works when only those columns are selected
|
|
result = mixed[['A', 'B']].min(1)
|
|
expected = Series([timedelta(days=-1)] * 3)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = mixed[['A', 'B']].min()
|
|
expected = Series([timedelta(seconds=5 * 60 + 5),
|
|
timedelta(days=-1)], index=['A', 'B'])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# GH 3106
|
|
df = DataFrame({'time': date_range('20130102', periods=5),
|
|
'time2': date_range('20130105', periods=5)})
|
|
df['off1'] = df['time2'] - df['time']
|
|
assert df['off1'].dtype == 'timedelta64[ns]'
|
|
|
|
df['off2'] = df['time'] - df['time2']
|
|
df._consolidate_inplace()
|
|
assert df['off1'].dtype == 'timedelta64[ns]'
|
|
assert df['off2'].dtype == 'timedelta64[ns]'
|
|
|
|
def test_sum_corner(self):
|
|
axis0 = self.empty.sum(0)
|
|
axis1 = self.empty.sum(1)
|
|
assert isinstance(axis0, Series)
|
|
assert isinstance(axis1, Series)
|
|
assert len(axis0) == 0
|
|
assert len(axis1) == 0
|
|
|
|
@pytest.mark.parametrize('method, unit', [
|
|
('sum', 0),
|
|
('prod', 1),
|
|
])
|
|
def test_sum_prod_nanops(self, method, unit):
|
|
idx = ['a', 'b', 'c']
|
|
df = pd.DataFrame({"a": [unit, unit],
|
|
"b": [unit, np.nan],
|
|
"c": [np.nan, np.nan]})
|
|
# The default
|
|
result = getattr(df, method)
|
|
expected = pd.Series([unit, unit, unit], index=idx, dtype='float64')
|
|
|
|
# min_count=1
|
|
result = getattr(df, method)(min_count=1)
|
|
expected = pd.Series([unit, unit, np.nan], index=idx)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count=0
|
|
result = getattr(df, method)(min_count=0)
|
|
expected = pd.Series([unit, unit, unit], index=idx, dtype='float64')
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = getattr(df.iloc[1:], method)(min_count=1)
|
|
expected = pd.Series([unit, np.nan, np.nan], index=idx)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count > 1
|
|
df = pd.DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5})
|
|
result = getattr(df, method)(min_count=5)
|
|
expected = pd.Series(result, index=['A', 'B'])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = getattr(df, method)(min_count=6)
|
|
expected = pd.Series(result, index=['A', 'B'])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_sum_nanops_timedelta(self):
|
|
# prod isn't defined on timedeltas
|
|
idx = ['a', 'b', 'c']
|
|
df = pd.DataFrame({"a": [0, 0],
|
|
"b": [0, np.nan],
|
|
"c": [np.nan, np.nan]})
|
|
|
|
df2 = df.apply(pd.to_timedelta)
|
|
|
|
# 0 by default
|
|
result = df2.sum()
|
|
expected = pd.Series([0, 0, 0], dtype='m8[ns]', index=idx)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count=0
|
|
result = df2.sum(min_count=0)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# min_count=1
|
|
result = df2.sum(min_count=1)
|
|
expected = pd.Series([0, 0, np.nan], dtype='m8[ns]', index=idx)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_sum_object(self):
|
|
values = self.frame.values.astype(int)
|
|
frame = DataFrame(values, index=self.frame.index,
|
|
columns=self.frame.columns)
|
|
deltas = frame * timedelta(1)
|
|
deltas.sum()
|
|
|
|
def test_sum_bool(self):
|
|
# ensure this works, bug report
|
|
bools = np.isnan(self.frame)
|
|
bools.sum(1)
|
|
bools.sum(0)
|
|
|
|
def test_mean_corner(self):
|
|
# unit test when have object data
|
|
the_mean = self.mixed_frame.mean(axis=0)
|
|
the_sum = self.mixed_frame.sum(axis=0, numeric_only=True)
|
|
tm.assert_index_equal(the_sum.index, the_mean.index)
|
|
assert len(the_mean.index) < len(self.mixed_frame.columns)
|
|
|
|
# xs sum mixed type, just want to know it works...
|
|
the_mean = self.mixed_frame.mean(axis=1)
|
|
the_sum = self.mixed_frame.sum(axis=1, numeric_only=True)
|
|
tm.assert_index_equal(the_sum.index, the_mean.index)
|
|
|
|
# take mean of boolean column
|
|
self.frame['bool'] = self.frame['A'] > 0
|
|
means = self.frame.mean(0)
|
|
assert means['bool'] == self.frame['bool'].values.mean()
|
|
|
|
def test_stats_mixed_type(self):
|
|
# don't blow up
|
|
self.mixed_frame.std(1)
|
|
self.mixed_frame.var(1)
|
|
self.mixed_frame.mean(1)
|
|
self.mixed_frame.skew(1)
|
|
|
|
def test_median_corner(self):
|
|
def wrapper(x):
|
|
if isna(x).any():
|
|
return np.nan
|
|
return np.median(x)
|
|
|
|
self._check_stat_op('median', wrapper, frame=self.intframe,
|
|
check_dtype=False, check_dates=True)
|
|
|
|
# Miscellanea
|
|
|
|
def test_count_objects(self):
|
|
dm = DataFrame(self.mixed_frame._series)
|
|
df = DataFrame(self.mixed_frame._series)
|
|
|
|
tm.assert_series_equal(dm.count(), df.count())
|
|
tm.assert_series_equal(dm.count(1), df.count(1))
|
|
|
|
def test_cumsum_corner(self):
|
|
dm = DataFrame(np.arange(20).reshape(4, 5),
|
|
index=lrange(4), columns=lrange(5))
|
|
# ?(wesm)
|
|
result = dm.cumsum() # noqa
|
|
|
|
def test_sum_bools(self):
|
|
df = DataFrame(index=lrange(1), columns=lrange(10))
|
|
bools = isna(df)
|
|
assert bools.sum(axis=1)[0] == 10
|
|
|
|
# Index of max / min
|
|
|
|
def test_idxmin(self):
|
|
frame = self.frame
|
|
frame.loc[5:10] = np.nan
|
|
frame.loc[15:20, -2:] = np.nan
|
|
for skipna in [True, False]:
|
|
for axis in [0, 1]:
|
|
for df in [frame, self.intframe]:
|
|
result = df.idxmin(axis=axis, skipna=skipna)
|
|
expected = df.apply(Series.idxmin, axis=axis,
|
|
skipna=skipna)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
pytest.raises(ValueError, frame.idxmin, axis=2)
|
|
|
|
def test_idxmax(self):
|
|
frame = self.frame
|
|
frame.loc[5:10] = np.nan
|
|
frame.loc[15:20, -2:] = np.nan
|
|
for skipna in [True, False]:
|
|
for axis in [0, 1]:
|
|
for df in [frame, self.intframe]:
|
|
result = df.idxmax(axis=axis, skipna=skipna)
|
|
expected = df.apply(Series.idxmax, axis=axis,
|
|
skipna=skipna)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
pytest.raises(ValueError, frame.idxmax, axis=2)
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Logical reductions
|
|
|
|
def test_any_all(self):
|
|
self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True)
|
|
self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True)
|
|
|
|
def test_any_all_extra(self):
|
|
df = DataFrame({
|
|
'A': [True, False, False],
|
|
'B': [True, True, False],
|
|
'C': [True, True, True],
|
|
}, index=['a', 'b', 'c'])
|
|
result = df[['A', 'B']].any(1)
|
|
expected = Series([True, True, False], index=['a', 'b', 'c'])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df[['A', 'B']].any(1, bool_only=True)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.all(1)
|
|
expected = Series([True, False, False], index=['a', 'b', 'c'])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.all(1, bool_only=True)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# Axis is None
|
|
result = df.all(axis=None).item()
|
|
assert result is False
|
|
|
|
result = df.any(axis=None).item()
|
|
assert result is True
|
|
|
|
result = df[['C']].all(axis=None).item()
|
|
assert result is True
|
|
|
|
# skip pathological failure cases
|
|
# class CantNonzero(object):
|
|
|
|
# def __nonzero__(self):
|
|
# raise ValueError
|
|
|
|
# df[4] = CantNonzero()
|
|
|
|
# it works!
|
|
# df.any(1)
|
|
# df.all(1)
|
|
# df.any(1, bool_only=True)
|
|
# df.all(1, bool_only=True)
|
|
|
|
# df[4][4] = np.nan
|
|
# df.any(1)
|
|
# df.all(1)
|
|
# df.any(1, bool_only=True)
|
|
# df.all(1, bool_only=True)
|
|
|
|
@pytest.mark.parametrize('func, data, expected', [
|
|
(np.any, {}, False),
|
|
(np.all, {}, True),
|
|
(np.any, {'A': []}, False),
|
|
(np.all, {'A': []}, True),
|
|
(np.any, {'A': [False, False]}, False),
|
|
(np.all, {'A': [False, False]}, False),
|
|
(np.any, {'A': [True, False]}, True),
|
|
(np.all, {'A': [True, False]}, False),
|
|
(np.any, {'A': [True, True]}, True),
|
|
(np.all, {'A': [True, True]}, True),
|
|
|
|
(np.any, {'A': [False], 'B': [False]}, False),
|
|
(np.all, {'A': [False], 'B': [False]}, False),
|
|
|
|
(np.any, {'A': [False, False], 'B': [False, True]}, True),
|
|
(np.all, {'A': [False, False], 'B': [False, True]}, False),
|
|
|
|
# other types
|
|
(np.all, {'A': pd.Series([0.0, 1.0], dtype='float')}, False),
|
|
(np.any, {'A': pd.Series([0.0, 1.0], dtype='float')}, True),
|
|
(np.all, {'A': pd.Series([0, 1], dtype=int)}, False),
|
|
(np.any, {'A': pd.Series([0, 1], dtype=int)}, True),
|
|
pytest.param(np.all, {'A': pd.Series([0, 1], dtype='M8[ns]')}, False,
|
|
marks=[td.skip_if_np_lt_115]),
|
|
pytest.param(np.any, {'A': pd.Series([0, 1], dtype='M8[ns]')}, True,
|
|
marks=[td.skip_if_np_lt_115]),
|
|
pytest.param(np.all, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True,
|
|
marks=[td.skip_if_np_lt_115]),
|
|
pytest.param(np.any, {'A': pd.Series([1, 2], dtype='M8[ns]')}, True,
|
|
marks=[td.skip_if_np_lt_115]),
|
|
pytest.param(np.all, {'A': pd.Series([0, 1], dtype='m8[ns]')}, False,
|
|
marks=[td.skip_if_np_lt_115]),
|
|
pytest.param(np.any, {'A': pd.Series([0, 1], dtype='m8[ns]')}, True,
|
|
marks=[td.skip_if_np_lt_115]),
|
|
pytest.param(np.all, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True,
|
|
marks=[td.skip_if_np_lt_115]),
|
|
pytest.param(np.any, {'A': pd.Series([1, 2], dtype='m8[ns]')}, True,
|
|
marks=[td.skip_if_np_lt_115]),
|
|
(np.all, {'A': pd.Series([0, 1], dtype='category')}, False),
|
|
(np.any, {'A': pd.Series([0, 1], dtype='category')}, True),
|
|
(np.all, {'A': pd.Series([1, 2], dtype='category')}, True),
|
|
(np.any, {'A': pd.Series([1, 2], dtype='category')}, True),
|
|
|
|
# # Mix
|
|
# GH-21484
|
|
# (np.all, {'A': pd.Series([10, 20], dtype='M8[ns]'),
|
|
# 'B': pd.Series([10, 20], dtype='m8[ns]')}, True),
|
|
])
|
|
def test_any_all_np_func(self, func, data, expected):
|
|
# https://github.com/pandas-dev/pandas/issues/19976
|
|
data = DataFrame(data)
|
|
result = func(data)
|
|
assert isinstance(result, np.bool_)
|
|
assert result.item() is expected
|
|
|
|
# method version
|
|
result = getattr(DataFrame(data), func.__name__)(axis=None)
|
|
assert isinstance(result, np.bool_)
|
|
assert result.item() is expected
|
|
|
|
def test_any_all_object(self):
|
|
# https://github.com/pandas-dev/pandas/issues/19976
|
|
result = np.all(DataFrame(columns=['a', 'b'])).item()
|
|
assert result is True
|
|
|
|
result = np.any(DataFrame(columns=['a', 'b'])).item()
|
|
assert result is False
|
|
|
|
@pytest.mark.parametrize('method', ['any', 'all'])
|
|
def test_any_all_level_axis_none_raises(self, method):
|
|
df = DataFrame(
|
|
{"A": 1},
|
|
index=MultiIndex.from_product([['A', 'B'], ['a', 'b']],
|
|
names=['out', 'in'])
|
|
)
|
|
xpr = "Must specify 'axis' when aggregating by level."
|
|
with tm.assert_raises_regex(ValueError, xpr):
|
|
getattr(df, method)(axis=None, level='out')
|
|
|
|
def _check_bool_op(self, name, alternative, frame=None, has_skipna=True,
|
|
has_bool_only=False):
|
|
if frame is None:
|
|
frame = self.frame > 0
|
|
# set some NAs
|
|
frame = DataFrame(frame.values.astype(object), frame.index,
|
|
frame.columns)
|
|
frame.loc[5:10] = np.nan
|
|
frame.loc[15:20, -2:] = np.nan
|
|
|
|
f = getattr(frame, name)
|
|
|
|
if has_skipna:
|
|
def skipna_wrapper(x):
|
|
nona = x.dropna().values
|
|
return alternative(nona)
|
|
|
|
def wrapper(x):
|
|
return alternative(x.values)
|
|
|
|
result0 = f(axis=0, skipna=False)
|
|
result1 = f(axis=1, skipna=False)
|
|
tm.assert_series_equal(result0, frame.apply(wrapper))
|
|
tm.assert_series_equal(result1, frame.apply(wrapper, axis=1),
|
|
check_dtype=False) # HACK: win32
|
|
else:
|
|
skipna_wrapper = alternative
|
|
wrapper = alternative
|
|
|
|
result0 = f(axis=0)
|
|
result1 = f(axis=1)
|
|
tm.assert_series_equal(result0, frame.apply(skipna_wrapper))
|
|
tm.assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1),
|
|
check_dtype=False)
|
|
|
|
# result = f(axis=1)
|
|
# comp = frame.apply(alternative, axis=1).reindex(result.index)
|
|
# assert_series_equal(result, comp)
|
|
|
|
# bad axis
|
|
pytest.raises(ValueError, f, axis=2)
|
|
|
|
# make sure works on mixed-type frame
|
|
mixed = self.mixed_frame
|
|
mixed['_bool_'] = np.random.randn(len(mixed)) > 0
|
|
getattr(mixed, name)(axis=0)
|
|
getattr(mixed, name)(axis=1)
|
|
|
|
class NonzeroFail(object):
|
|
|
|
def __nonzero__(self):
|
|
raise ValueError
|
|
|
|
mixed['_nonzero_fail_'] = NonzeroFail()
|
|
|
|
if has_bool_only:
|
|
getattr(mixed, name)(axis=0, bool_only=True)
|
|
getattr(mixed, name)(axis=1, bool_only=True)
|
|
getattr(frame, name)(axis=0, bool_only=False)
|
|
getattr(frame, name)(axis=1, bool_only=False)
|
|
|
|
# all NA case
|
|
if has_skipna:
|
|
all_na = frame * np.NaN
|
|
r0 = getattr(all_na, name)(axis=0)
|
|
r1 = getattr(all_na, name)(axis=1)
|
|
if name == 'any':
|
|
assert not r0.any()
|
|
assert not r1.any()
|
|
else:
|
|
assert r0.all()
|
|
assert r1.all()
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Isin
|
|
|
|
def test_isin(self):
|
|
# GH #4211
|
|
df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
|
|
'ids2': ['a', 'n', 'c', 'n']},
|
|
index=['foo', 'bar', 'baz', 'qux'])
|
|
other = ['a', 'b', 'c']
|
|
|
|
result = df.isin(other)
|
|
expected = DataFrame([df.loc[s].isin(other) for s in df.index])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize("empty", [[], Series(), np.array([])])
|
|
def test_isin_empty(self, empty):
|
|
# see gh-16991
|
|
df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
|
|
expected = DataFrame(False, df.index, df.columns)
|
|
|
|
result = df.isin(empty)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_isin_dict(self):
|
|
df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
|
|
d = {'A': ['a']}
|
|
|
|
expected = DataFrame(False, df.index, df.columns)
|
|
expected.loc[0, 'A'] = True
|
|
|
|
result = df.isin(d)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# non unique columns
|
|
df = DataFrame({'A': ['a', 'b', 'c'], 'B': ['a', 'e', 'f']})
|
|
df.columns = ['A', 'A']
|
|
expected = DataFrame(False, df.index, df.columns)
|
|
expected.loc[0, 'A'] = True
|
|
result = df.isin(d)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_isin_with_string_scalar(self):
|
|
# GH4763
|
|
df = DataFrame({'vals': [1, 2, 3, 4], 'ids': ['a', 'b', 'f', 'n'],
|
|
'ids2': ['a', 'n', 'c', 'n']},
|
|
index=['foo', 'bar', 'baz', 'qux'])
|
|
with pytest.raises(TypeError):
|
|
df.isin('a')
|
|
|
|
with pytest.raises(TypeError):
|
|
df.isin('aaa')
|
|
|
|
def test_isin_df(self):
|
|
df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
|
|
df2 = DataFrame({'A': [0, 2, 12, 4], 'B': [2, np.nan, 4, 5]})
|
|
expected = DataFrame(False, df1.index, df1.columns)
|
|
result = df1.isin(df2)
|
|
expected['A'].loc[[1, 3]] = True
|
|
expected['B'].loc[[0, 2]] = True
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# partial overlapping columns
|
|
df2.columns = ['A', 'C']
|
|
result = df1.isin(df2)
|
|
expected['B'] = False
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_isin_tuples(self):
|
|
# GH16394
|
|
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
|
|
df['C'] = list(zip(df['A'], df['B']))
|
|
result = df['C'].isin([(1, 'a')])
|
|
tm.assert_series_equal(result,
|
|
Series([True, False, False], name="C"))
|
|
|
|
def test_isin_df_dupe_values(self):
|
|
df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]})
|
|
# just cols duped
|
|
df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]],
|
|
columns=['B', 'B'])
|
|
with pytest.raises(ValueError):
|
|
df1.isin(df2)
|
|
|
|
# just index duped
|
|
df2 = DataFrame([[0, 2], [12, 4], [2, np.nan], [4, 5]],
|
|
columns=['A', 'B'], index=[0, 0, 1, 1])
|
|
with pytest.raises(ValueError):
|
|
df1.isin(df2)
|
|
|
|
# cols and index:
|
|
df2.columns = ['B', 'B']
|
|
with pytest.raises(ValueError):
|
|
df1.isin(df2)
|
|
|
|
def test_isin_dupe_self(self):
|
|
other = DataFrame({'A': [1, 0, 1, 0], 'B': [1, 1, 0, 0]})
|
|
df = DataFrame([[1, 1], [1, 0], [0, 0]], columns=['A', 'A'])
|
|
result = df.isin(other)
|
|
expected = DataFrame(False, index=df.index, columns=df.columns)
|
|
expected.loc[0] = True
|
|
expected.iloc[1, 1] = True
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_isin_against_series(self):
|
|
df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]},
|
|
index=['a', 'b', 'c', 'd'])
|
|
s = pd.Series([1, 3, 11, 4], index=['a', 'b', 'c', 'd'])
|
|
expected = DataFrame(False, index=df.index, columns=df.columns)
|
|
expected['A'].loc['a'] = True
|
|
expected.loc['d'] = True
|
|
result = df.isin(s)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_isin_multiIndex(self):
|
|
idx = MultiIndex.from_tuples([(0, 'a', 'foo'), (0, 'a', 'bar'),
|
|
(0, 'b', 'bar'), (0, 'b', 'baz'),
|
|
(2, 'a', 'foo'), (2, 'a', 'bar'),
|
|
(2, 'c', 'bar'), (2, 'c', 'baz'),
|
|
(1, 'b', 'foo'), (1, 'b', 'bar'),
|
|
(1, 'c', 'bar'), (1, 'c', 'baz')])
|
|
df1 = DataFrame({'A': np.ones(12),
|
|
'B': np.zeros(12)}, index=idx)
|
|
df2 = DataFrame({'A': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
|
|
'B': [1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1]})
|
|
# against regular index
|
|
expected = DataFrame(False, index=df1.index, columns=df1.columns)
|
|
result = df1.isin(df2)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df2.index = idx
|
|
expected = df2.values.astype(np.bool)
|
|
expected[:, 1] = ~expected[:, 1]
|
|
expected = DataFrame(expected, columns=['A', 'B'], index=idx)
|
|
|
|
result = df1.isin(df2)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_isin_empty_datetimelike(self):
|
|
# GH 15473
|
|
df1_ts = DataFrame({'date':
|
|
pd.to_datetime(['2014-01-01', '2014-01-02'])})
|
|
df1_td = DataFrame({'date':
|
|
[pd.Timedelta(1, 's'), pd.Timedelta(2, 's')]})
|
|
df2 = DataFrame({'date': []})
|
|
df3 = DataFrame()
|
|
|
|
expected = DataFrame({'date': [False, False]})
|
|
|
|
result = df1_ts.isin(df2)
|
|
tm.assert_frame_equal(result, expected)
|
|
result = df1_ts.isin(df3)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df1_td.isin(df2)
|
|
tm.assert_frame_equal(result, expected)
|
|
result = df1_td.isin(df3)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Row deduplication
|
|
|
|
def test_drop_duplicates(self):
|
|
df = DataFrame({'AAA': ['foo', 'bar', 'foo', 'bar',
|
|
'foo', 'bar', 'bar', 'foo'],
|
|
'B': ['one', 'one', 'two', 'two',
|
|
'two', 'two', 'one', 'two'],
|
|
'C': [1, 1, 2, 2, 2, 2, 1, 2],
|
|
'D': lrange(8)})
|
|
|
|
# single column
|
|
result = df.drop_duplicates('AAA')
|
|
expected = df[:2]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates('AAA', keep='last')
|
|
expected = df.loc[[6, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates('AAA', keep=False)
|
|
expected = df.loc[[]]
|
|
tm.assert_frame_equal(result, expected)
|
|
assert len(result) == 0
|
|
|
|
# multi column
|
|
expected = df.loc[[0, 1, 2, 3]]
|
|
result = df.drop_duplicates(np.array(['AAA', 'B']))
|
|
tm.assert_frame_equal(result, expected)
|
|
result = df.drop_duplicates(['AAA', 'B'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates(('AAA', 'B'), keep='last')
|
|
expected = df.loc[[0, 5, 6, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates(('AAA', 'B'), keep=False)
|
|
expected = df.loc[[0]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# consider everything
|
|
df2 = df.loc[:, ['AAA', 'B', 'C']]
|
|
|
|
result = df2.drop_duplicates()
|
|
# in this case only
|
|
expected = df2.drop_duplicates(['AAA', 'B'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df2.drop_duplicates(keep='last')
|
|
expected = df2.drop_duplicates(['AAA', 'B'], keep='last')
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df2.drop_duplicates(keep=False)
|
|
expected = df2.drop_duplicates(['AAA', 'B'], keep=False)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# integers
|
|
result = df.drop_duplicates('C')
|
|
expected = df.iloc[[0, 2]]
|
|
tm.assert_frame_equal(result, expected)
|
|
result = df.drop_duplicates('C', keep='last')
|
|
expected = df.iloc[[-2, -1]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df['E'] = df['C'].astype('int8')
|
|
result = df.drop_duplicates('E')
|
|
expected = df.iloc[[0, 2]]
|
|
tm.assert_frame_equal(result, expected)
|
|
result = df.drop_duplicates('E', keep='last')
|
|
expected = df.iloc[[-2, -1]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# GH 11376
|
|
df = pd.DataFrame({'x': [7, 6, 3, 3, 4, 8, 0],
|
|
'y': [0, 6, 5, 5, 9, 1, 2]})
|
|
expected = df.loc[df.index != 3]
|
|
tm.assert_frame_equal(df.drop_duplicates(), expected)
|
|
|
|
df = pd.DataFrame([[1, 0], [0, 2]])
|
|
tm.assert_frame_equal(df.drop_duplicates(), df)
|
|
|
|
df = pd.DataFrame([[-2, 0], [0, -4]])
|
|
tm.assert_frame_equal(df.drop_duplicates(), df)
|
|
|
|
x = np.iinfo(np.int64).max / 3 * 2
|
|
df = pd.DataFrame([[-x, x], [0, x + 4]])
|
|
tm.assert_frame_equal(df.drop_duplicates(), df)
|
|
|
|
df = pd.DataFrame([[-x, x], [x, x + 4]])
|
|
tm.assert_frame_equal(df.drop_duplicates(), df)
|
|
|
|
# GH 11864
|
|
df = pd.DataFrame([i] * 9 for i in range(16))
|
|
df = df.append([[1] + [0] * 8], ignore_index=True)
|
|
|
|
for keep in ['first', 'last', False]:
|
|
assert df.duplicated(keep=keep).sum() == 0
|
|
|
|
@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
|
|
def test_duplicated_with_misspelled_column_name(self, subset):
|
|
# GH 19730
|
|
df = pd.DataFrame({'A': [0, 0, 1],
|
|
'B': [0, 0, 1],
|
|
'C': [0, 0, 1]})
|
|
|
|
with pytest.raises(KeyError):
|
|
df.duplicated(subset)
|
|
|
|
with pytest.raises(KeyError):
|
|
df.drop_duplicates(subset)
|
|
|
|
@pytest.mark.slow
|
|
def test_duplicated_do_not_fail_on_wide_dataframes(self):
|
|
# gh-21524
|
|
# Given the wide dataframe with a lot of columns
|
|
# with different (important!) values
|
|
data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
|
|
for i in range(100)}
|
|
df = pd.DataFrame(data).T
|
|
result = df.duplicated()
|
|
|
|
# Then duplicates produce the bool pd.Series as a result
|
|
# and don't fail during calculation.
|
|
# Actual values doesn't matter here, though usually
|
|
# it's all False in this case
|
|
assert isinstance(result, pd.Series)
|
|
assert result.dtype == np.bool
|
|
|
|
def test_drop_duplicates_with_duplicate_column_names(self):
|
|
# GH17836
|
|
df = DataFrame([
|
|
[1, 2, 5],
|
|
[3, 4, 6],
|
|
[3, 4, 7]
|
|
], columns=['a', 'a', 'b'])
|
|
|
|
result0 = df.drop_duplicates()
|
|
tm.assert_frame_equal(result0, df)
|
|
|
|
result1 = df.drop_duplicates('a')
|
|
expected1 = df[:2]
|
|
tm.assert_frame_equal(result1, expected1)
|
|
|
|
def test_drop_duplicates_for_take_all(self):
|
|
df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
|
|
'foo', 'bar', 'qux', 'foo'],
|
|
'B': ['one', 'one', 'two', 'two',
|
|
'two', 'two', 'one', 'two'],
|
|
'C': [1, 1, 2, 2, 2, 2, 1, 2],
|
|
'D': lrange(8)})
|
|
|
|
# single column
|
|
result = df.drop_duplicates('AAA')
|
|
expected = df.iloc[[0, 1, 2, 6]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates('AAA', keep='last')
|
|
expected = df.iloc[[2, 5, 6, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates('AAA', keep=False)
|
|
expected = df.iloc[[2, 6]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# multiple columns
|
|
result = df.drop_duplicates(['AAA', 'B'])
|
|
expected = df.iloc[[0, 1, 2, 3, 4, 6]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates(['AAA', 'B'], keep='last')
|
|
expected = df.iloc[[0, 1, 2, 5, 6, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates(['AAA', 'B'], keep=False)
|
|
expected = df.iloc[[0, 1, 2, 6]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_drop_duplicates_tuple(self):
|
|
df = DataFrame({('AA', 'AB'): ['foo', 'bar', 'foo', 'bar',
|
|
'foo', 'bar', 'bar', 'foo'],
|
|
'B': ['one', 'one', 'two', 'two',
|
|
'two', 'two', 'one', 'two'],
|
|
'C': [1, 1, 2, 2, 2, 2, 1, 2],
|
|
'D': lrange(8)})
|
|
|
|
# single column
|
|
result = df.drop_duplicates(('AA', 'AB'))
|
|
expected = df[:2]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates(('AA', 'AB'), keep='last')
|
|
expected = df.loc[[6, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates(('AA', 'AB'), keep=False)
|
|
expected = df.loc[[]] # empty df
|
|
assert len(result) == 0
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# multi column
|
|
expected = df.loc[[0, 1, 2, 3]]
|
|
result = df.drop_duplicates((('AA', 'AB'), 'B'))
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_drop_duplicates_NA(self):
|
|
# none
|
|
df = DataFrame({'A': [None, None, 'foo', 'bar',
|
|
'foo', 'bar', 'bar', 'foo'],
|
|
'B': ['one', 'one', 'two', 'two',
|
|
'two', 'two', 'one', 'two'],
|
|
'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
|
|
'D': lrange(8)})
|
|
|
|
# single column
|
|
result = df.drop_duplicates('A')
|
|
expected = df.loc[[0, 2, 3]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates('A', keep='last')
|
|
expected = df.loc[[1, 6, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates('A', keep=False)
|
|
expected = df.loc[[]] # empty df
|
|
tm.assert_frame_equal(result, expected)
|
|
assert len(result) == 0
|
|
|
|
# multi column
|
|
result = df.drop_duplicates(['A', 'B'])
|
|
expected = df.loc[[0, 2, 3, 6]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates(['A', 'B'], keep='last')
|
|
expected = df.loc[[1, 5, 6, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates(['A', 'B'], keep=False)
|
|
expected = df.loc[[6]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# nan
|
|
df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
|
|
'foo', 'bar', 'bar', 'foo'],
|
|
'B': ['one', 'one', 'two', 'two',
|
|
'two', 'two', 'one', 'two'],
|
|
'C': [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.],
|
|
'D': lrange(8)})
|
|
|
|
# single column
|
|
result = df.drop_duplicates('C')
|
|
expected = df[:2]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates('C', keep='last')
|
|
expected = df.loc[[3, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates('C', keep=False)
|
|
expected = df.loc[[]] # empty df
|
|
tm.assert_frame_equal(result, expected)
|
|
assert len(result) == 0
|
|
|
|
# multi column
|
|
result = df.drop_duplicates(['C', 'B'])
|
|
expected = df.loc[[0, 1, 2, 4]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates(['C', 'B'], keep='last')
|
|
expected = df.loc[[1, 3, 6, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates(['C', 'B'], keep=False)
|
|
expected = df.loc[[1]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_drop_duplicates_NA_for_take_all(self):
|
|
# none
|
|
df = DataFrame({'A': [None, None, 'foo', 'bar',
|
|
'foo', 'baz', 'bar', 'qux'],
|
|
'C': [1.0, np.nan, np.nan, np.nan, 1., 2., 3, 1.]})
|
|
|
|
# single column
|
|
result = df.drop_duplicates('A')
|
|
expected = df.iloc[[0, 2, 3, 5, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates('A', keep='last')
|
|
expected = df.iloc[[1, 4, 5, 6, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates('A', keep=False)
|
|
expected = df.iloc[[5, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# nan
|
|
|
|
# single column
|
|
result = df.drop_duplicates('C')
|
|
expected = df.iloc[[0, 1, 5, 6]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates('C', keep='last')
|
|
expected = df.iloc[[3, 5, 6, 7]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.drop_duplicates('C', keep=False)
|
|
expected = df.iloc[[5, 6]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_drop_duplicates_inplace(self):
|
|
orig = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
|
|
'foo', 'bar', 'bar', 'foo'],
|
|
'B': ['one', 'one', 'two', 'two',
|
|
'two', 'two', 'one', 'two'],
|
|
'C': [1, 1, 2, 2, 2, 2, 1, 2],
|
|
'D': lrange(8)})
|
|
|
|
# single column
|
|
df = orig.copy()
|
|
df.drop_duplicates('A', inplace=True)
|
|
expected = orig[:2]
|
|
result = df
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df = orig.copy()
|
|
df.drop_duplicates('A', keep='last', inplace=True)
|
|
expected = orig.loc[[6, 7]]
|
|
result = df
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df = orig.copy()
|
|
df.drop_duplicates('A', keep=False, inplace=True)
|
|
expected = orig.loc[[]]
|
|
result = df
|
|
tm.assert_frame_equal(result, expected)
|
|
assert len(df) == 0
|
|
|
|
# multi column
|
|
df = orig.copy()
|
|
df.drop_duplicates(['A', 'B'], inplace=True)
|
|
expected = orig.loc[[0, 1, 2, 3]]
|
|
result = df
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df = orig.copy()
|
|
df.drop_duplicates(['A', 'B'], keep='last', inplace=True)
|
|
expected = orig.loc[[0, 5, 6, 7]]
|
|
result = df
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df = orig.copy()
|
|
df.drop_duplicates(['A', 'B'], keep=False, inplace=True)
|
|
expected = orig.loc[[0]]
|
|
result = df
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# consider everything
|
|
orig2 = orig.loc[:, ['A', 'B', 'C']].copy()
|
|
|
|
df2 = orig2.copy()
|
|
df2.drop_duplicates(inplace=True)
|
|
# in this case only
|
|
expected = orig2.drop_duplicates(['A', 'B'])
|
|
result = df2
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df2 = orig2.copy()
|
|
df2.drop_duplicates(keep='last', inplace=True)
|
|
expected = orig2.drop_duplicates(['A', 'B'], keep='last')
|
|
result = df2
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
df2 = orig2.copy()
|
|
df2.drop_duplicates(keep=False, inplace=True)
|
|
expected = orig2.drop_duplicates(['A', 'B'], keep=False)
|
|
result = df2
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Rounding
|
|
def test_round(self):
|
|
# GH 2665
|
|
|
|
# Test that rounding an empty DataFrame does nothing
|
|
df = DataFrame()
|
|
tm.assert_frame_equal(df, df.round())
|
|
|
|
# Here's the test frame we'll be working with
|
|
df = DataFrame({'col1': [1.123, 2.123, 3.123],
|
|
'col2': [1.234, 2.234, 3.234]})
|
|
|
|
# Default round to integer (i.e. decimals=0)
|
|
expected_rounded = DataFrame(
|
|
{'col1': [1., 2., 3.], 'col2': [1., 2., 3.]})
|
|
tm.assert_frame_equal(df.round(), expected_rounded)
|
|
|
|
# Round with an integer
|
|
decimals = 2
|
|
expected_rounded = DataFrame({'col1': [1.12, 2.12, 3.12],
|
|
'col2': [1.23, 2.23, 3.23]})
|
|
tm.assert_frame_equal(df.round(decimals), expected_rounded)
|
|
|
|
# This should also work with np.round (since np.round dispatches to
|
|
# df.round)
|
|
tm.assert_frame_equal(np.round(df, decimals), expected_rounded)
|
|
|
|
# Round with a list
|
|
round_list = [1, 2]
|
|
with pytest.raises(TypeError):
|
|
df.round(round_list)
|
|
|
|
# Round with a dictionary
|
|
expected_rounded = DataFrame(
|
|
{'col1': [1.1, 2.1, 3.1], 'col2': [1.23, 2.23, 3.23]})
|
|
round_dict = {'col1': 1, 'col2': 2}
|
|
tm.assert_frame_equal(df.round(round_dict), expected_rounded)
|
|
|
|
# Incomplete dict
|
|
expected_partially_rounded = DataFrame(
|
|
{'col1': [1.123, 2.123, 3.123], 'col2': [1.2, 2.2, 3.2]})
|
|
partial_round_dict = {'col2': 1}
|
|
tm.assert_frame_equal(df.round(partial_round_dict),
|
|
expected_partially_rounded)
|
|
|
|
# Dict with unknown elements
|
|
wrong_round_dict = {'col3': 2, 'col2': 1}
|
|
tm.assert_frame_equal(df.round(wrong_round_dict),
|
|
expected_partially_rounded)
|
|
|
|
# float input to `decimals`
|
|
non_int_round_dict = {'col1': 1, 'col2': 0.5}
|
|
with pytest.raises(TypeError):
|
|
df.round(non_int_round_dict)
|
|
|
|
# String input
|
|
non_int_round_dict = {'col1': 1, 'col2': 'foo'}
|
|
with pytest.raises(TypeError):
|
|
df.round(non_int_round_dict)
|
|
|
|
non_int_round_Series = Series(non_int_round_dict)
|
|
with pytest.raises(TypeError):
|
|
df.round(non_int_round_Series)
|
|
|
|
# List input
|
|
non_int_round_dict = {'col1': 1, 'col2': [1, 2]}
|
|
with pytest.raises(TypeError):
|
|
df.round(non_int_round_dict)
|
|
|
|
non_int_round_Series = Series(non_int_round_dict)
|
|
with pytest.raises(TypeError):
|
|
df.round(non_int_round_Series)
|
|
|
|
# Non integer Series inputs
|
|
non_int_round_Series = Series(non_int_round_dict)
|
|
with pytest.raises(TypeError):
|
|
df.round(non_int_round_Series)
|
|
|
|
non_int_round_Series = Series(non_int_round_dict)
|
|
with pytest.raises(TypeError):
|
|
df.round(non_int_round_Series)
|
|
|
|
# Negative numbers
|
|
negative_round_dict = {'col1': -1, 'col2': -2}
|
|
big_df = df * 100
|
|
expected_neg_rounded = DataFrame(
|
|
{'col1': [110., 210, 310], 'col2': [100., 200, 300]})
|
|
tm.assert_frame_equal(big_df.round(negative_round_dict),
|
|
expected_neg_rounded)
|
|
|
|
# nan in Series round
|
|
nan_round_Series = Series({'col1': nan, 'col2': 1})
|
|
|
|
# TODO(wesm): unused?
|
|
expected_nan_round = DataFrame({ # noqa
|
|
'col1': [1.123, 2.123, 3.123],
|
|
'col2': [1.2, 2.2, 3.2]})
|
|
|
|
with pytest.raises(TypeError):
|
|
df.round(nan_round_Series)
|
|
|
|
# Make sure this doesn't break existing Series.round
|
|
tm.assert_series_equal(df['col1'].round(1), expected_rounded['col1'])
|
|
|
|
# named columns
|
|
# GH 11986
|
|
decimals = 2
|
|
expected_rounded = DataFrame(
|
|
{'col1': [1.12, 2.12, 3.12], 'col2': [1.23, 2.23, 3.23]})
|
|
df.columns.name = "cols"
|
|
expected_rounded.columns.name = "cols"
|
|
tm.assert_frame_equal(df.round(decimals), expected_rounded)
|
|
|
|
# interaction of named columns & series
|
|
tm.assert_series_equal(df['col1'].round(decimals),
|
|
expected_rounded['col1'])
|
|
tm.assert_series_equal(df.round(decimals)['col1'],
|
|
expected_rounded['col1'])
|
|
|
|
def test_numpy_round(self):
|
|
# See gh-12600
|
|
df = DataFrame([[1.53, 1.36], [0.06, 7.01]])
|
|
out = np.round(df, decimals=0)
|
|
expected = DataFrame([[2., 1.], [0., 7.]])
|
|
tm.assert_frame_equal(out, expected)
|
|
|
|
msg = "the 'out' parameter is not supported"
|
|
with tm.assert_raises_regex(ValueError, msg):
|
|
np.round(df, decimals=0, out=df)
|
|
|
|
def test_round_mixed_type(self):
|
|
# GH11885
|
|
df = DataFrame({'col1': [1.1, 2.2, 3.3, 4.4],
|
|
'col2': ['1', 'a', 'c', 'f'],
|
|
'col3': date_range('20111111', periods=4)})
|
|
round_0 = DataFrame({'col1': [1., 2., 3., 4.],
|
|
'col2': ['1', 'a', 'c', 'f'],
|
|
'col3': date_range('20111111', periods=4)})
|
|
tm.assert_frame_equal(df.round(), round_0)
|
|
tm.assert_frame_equal(df.round(1), df)
|
|
tm.assert_frame_equal(df.round({'col1': 1}), df)
|
|
tm.assert_frame_equal(df.round({'col1': 0}), round_0)
|
|
tm.assert_frame_equal(df.round({'col1': 0, 'col2': 1}), round_0)
|
|
tm.assert_frame_equal(df.round({'col3': 1}), df)
|
|
|
|
def test_round_issue(self):
|
|
# GH11611
|
|
|
|
df = pd.DataFrame(np.random.random([3, 3]), columns=['A', 'B', 'C'],
|
|
index=['first', 'second', 'third'])
|
|
|
|
dfs = pd.concat((df, df), axis=1)
|
|
rounded = dfs.round()
|
|
tm.assert_index_equal(rounded.index, dfs.index)
|
|
|
|
decimals = pd.Series([1, 0, 2], index=['A', 'B', 'A'])
|
|
pytest.raises(ValueError, df.round, decimals)
|
|
|
|
def test_built_in_round(self):
|
|
if not compat.PY3:
|
|
pytest.skip("build in round cannot be overridden "
|
|
"prior to Python 3")
|
|
|
|
# GH11763
|
|
# Here's the test frame we'll be working with
|
|
df = DataFrame(
|
|
{'col1': [1.123, 2.123, 3.123], 'col2': [1.234, 2.234, 3.234]})
|
|
|
|
# Default round to integer (i.e. decimals=0)
|
|
expected_rounded = DataFrame(
|
|
{'col1': [1., 2., 3.], 'col2': [1., 2., 3.]})
|
|
tm.assert_frame_equal(round(df), expected_rounded)
|
|
|
|
def test_pct_change(self):
|
|
# GH 11150
|
|
pnl = DataFrame([np.arange(0, 40, 10), np.arange(0, 40, 10), np.arange(
|
|
0, 40, 10)]).astype(np.float64)
|
|
pnl.iat[1, 0] = np.nan
|
|
pnl.iat[1, 1] = np.nan
|
|
pnl.iat[2, 3] = 60
|
|
|
|
for axis in range(2):
|
|
expected = pnl.ffill(axis=axis) / pnl.ffill(axis=axis).shift(
|
|
axis=axis) - 1
|
|
result = pnl.pct_change(axis=axis, fill_method='pad')
|
|
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Clip
|
|
|
|
def test_clip(self):
|
|
median = self.frame.median().median()
|
|
original = self.frame.copy()
|
|
|
|
capped = self.frame.clip_upper(median)
|
|
assert not (capped.values > median).any()
|
|
|
|
floored = self.frame.clip_lower(median)
|
|
assert not (floored.values < median).any()
|
|
|
|
double = self.frame.clip(upper=median, lower=median)
|
|
assert not (double.values != median).any()
|
|
|
|
# Verify that self.frame was not changed inplace
|
|
assert (self.frame.values == original.values).all()
|
|
|
|
def test_inplace_clip(self):
|
|
# GH #15388
|
|
median = self.frame.median().median()
|
|
frame_copy = self.frame.copy()
|
|
|
|
frame_copy.clip_upper(median, inplace=True)
|
|
assert not (frame_copy.values > median).any()
|
|
frame_copy = self.frame.copy()
|
|
|
|
frame_copy.clip_lower(median, inplace=True)
|
|
assert not (frame_copy.values < median).any()
|
|
frame_copy = self.frame.copy()
|
|
|
|
frame_copy.clip(upper=median, lower=median, inplace=True)
|
|
assert not (frame_copy.values != median).any()
|
|
|
|
def test_dataframe_clip(self):
|
|
# GH #2747
|
|
df = DataFrame(np.random.randn(1000, 2))
|
|
|
|
for lb, ub in [(-1, 1), (1, -1)]:
|
|
clipped_df = df.clip(lb, ub)
|
|
|
|
lb, ub = min(lb, ub), max(ub, lb)
|
|
lb_mask = df.values <= lb
|
|
ub_mask = df.values >= ub
|
|
mask = ~lb_mask & ~ub_mask
|
|
assert (clipped_df.values[lb_mask] == lb).all()
|
|
assert (clipped_df.values[ub_mask] == ub).all()
|
|
assert (clipped_df.values[mask] == df.values[mask]).all()
|
|
|
|
def test_clip_mixed_numeric(self):
|
|
# TODO(jreback)
|
|
# clip on mixed integer or floats
|
|
# with integer clippers coerces to float
|
|
df = DataFrame({'A': [1, 2, 3],
|
|
'B': [1., np.nan, 3.]})
|
|
result = df.clip(1, 2)
|
|
expected = DataFrame({'A': [1, 2, 2.],
|
|
'B': [1., np.nan, 2.]})
|
|
tm.assert_frame_equal(result, expected, check_like=True)
|
|
|
|
@pytest.mark.parametrize("inplace", [True, False])
|
|
def test_clip_against_series(self, inplace):
|
|
# GH #6966
|
|
|
|
df = DataFrame(np.random.randn(1000, 2))
|
|
lb = Series(np.random.randn(1000))
|
|
ub = lb + 1
|
|
|
|
original = df.copy()
|
|
clipped_df = df.clip(lb, ub, axis=0, inplace=inplace)
|
|
|
|
if inplace:
|
|
clipped_df = df
|
|
|
|
for i in range(2):
|
|
lb_mask = original.iloc[:, i] <= lb
|
|
ub_mask = original.iloc[:, i] >= ub
|
|
mask = ~lb_mask & ~ub_mask
|
|
|
|
result = clipped_df.loc[lb_mask, i]
|
|
tm.assert_series_equal(result, lb[lb_mask], check_names=False)
|
|
assert result.name == i
|
|
|
|
result = clipped_df.loc[ub_mask, i]
|
|
tm.assert_series_equal(result, ub[ub_mask], check_names=False)
|
|
assert result.name == i
|
|
|
|
tm.assert_series_equal(clipped_df.loc[mask, i], df.loc[mask, i])
|
|
|
|
@pytest.mark.parametrize("inplace", [True, False])
|
|
@pytest.mark.parametrize("lower", [[2, 3, 4], np.asarray([2, 3, 4])])
|
|
@pytest.mark.parametrize("axis,res", [
|
|
(0, [[2., 2., 3.], [4., 5., 6.], [7., 7., 7.]]),
|
|
(1, [[2., 3., 4.], [4., 5., 6.], [5., 6., 7.]])
|
|
])
|
|
def test_clip_against_list_like(self, inplace, lower, axis, res):
|
|
# GH #15390
|
|
original = self.simple.copy(deep=True)
|
|
|
|
result = original.clip(lower=lower, upper=[5, 6, 7],
|
|
axis=axis, inplace=inplace)
|
|
|
|
expected = pd.DataFrame(res,
|
|
columns=original.columns,
|
|
index=original.index)
|
|
if inplace:
|
|
result = original
|
|
tm.assert_frame_equal(result, expected, check_exact=True)
|
|
|
|
@pytest.mark.parametrize("axis", [0, 1, None])
|
|
def test_clip_against_frame(self, axis):
|
|
df = DataFrame(np.random.randn(1000, 2))
|
|
lb = DataFrame(np.random.randn(1000, 2))
|
|
ub = lb + 1
|
|
|
|
clipped_df = df.clip(lb, ub, axis=axis)
|
|
|
|
lb_mask = df <= lb
|
|
ub_mask = df >= ub
|
|
mask = ~lb_mask & ~ub_mask
|
|
|
|
tm.assert_frame_equal(clipped_df[lb_mask], lb[lb_mask])
|
|
tm.assert_frame_equal(clipped_df[ub_mask], ub[ub_mask])
|
|
tm.assert_frame_equal(clipped_df[mask], df[mask])
|
|
|
|
def test_clip_with_na_args(self):
|
|
"""Should process np.nan argument as None """
|
|
# GH # 17276
|
|
tm.assert_frame_equal(self.frame.clip(np.nan), self.frame)
|
|
tm.assert_frame_equal(self.frame.clip(upper=np.nan, lower=np.nan),
|
|
self.frame)
|
|
|
|
# GH #19992
|
|
df = DataFrame({'col_0': [1, 2, 3], 'col_1': [4, 5, 6],
|
|
'col_2': [7, 8, 9]})
|
|
|
|
result = df.clip(lower=[4, 5, np.nan], axis=0)
|
|
expected = DataFrame({'col_0': [4, 5, np.nan], 'col_1': [4, 5, np.nan],
|
|
'col_2': [7, 8, np.nan]})
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.clip(lower=[4, 5, np.nan], axis=1)
|
|
expected = DataFrame({'col_0': [4, 4, 4], 'col_1': [5, 5, 6],
|
|
'col_2': [np.nan, np.nan, np.nan]})
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Matrix-like
|
|
def test_dot(self):
|
|
a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'],
|
|
columns=['p', 'q', 'r', 's'])
|
|
b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'],
|
|
columns=['one', 'two'])
|
|
|
|
result = a.dot(b)
|
|
expected = DataFrame(np.dot(a.values, b.values),
|
|
index=['a', 'b', 'c'],
|
|
columns=['one', 'two'])
|
|
# Check alignment
|
|
b1 = b.reindex(index=reversed(b.index))
|
|
result = a.dot(b)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# Check series argument
|
|
result = a.dot(b['one'])
|
|
tm.assert_series_equal(result, expected['one'], check_names=False)
|
|
assert result.name is None
|
|
|
|
result = a.dot(b1['one'])
|
|
tm.assert_series_equal(result, expected['one'], check_names=False)
|
|
assert result.name is None
|
|
|
|
# can pass correct-length arrays
|
|
row = a.iloc[0].values
|
|
|
|
result = a.dot(row)
|
|
exp = a.dot(a.iloc[0])
|
|
tm.assert_series_equal(result, exp)
|
|
|
|
with tm.assert_raises_regex(ValueError,
|
|
'Dot product shape mismatch'):
|
|
a.dot(row[:-1])
|
|
|
|
a = np.random.rand(1, 5)
|
|
b = np.random.rand(5, 1)
|
|
A = DataFrame(a)
|
|
|
|
# TODO(wesm): unused
|
|
B = DataFrame(b) # noqa
|
|
|
|
# it works
|
|
result = A.dot(b)
|
|
|
|
# unaligned
|
|
df = DataFrame(randn(3, 4), index=[1, 2, 3], columns=lrange(4))
|
|
df2 = DataFrame(randn(5, 3), index=lrange(5), columns=[1, 2, 3])
|
|
|
|
with tm.assert_raises_regex(ValueError, 'aligned'):
|
|
df.dot(df2)
|
|
|
|
@pytest.mark.skipif(not PY35,
|
|
reason='matmul supported for Python>=3.5')
|
|
@pytest.mark.xfail(
|
|
_np_version_under1p12,
|
|
reason="unpredictable return types under numpy < 1.12")
|
|
def test_matmul(self):
|
|
# matmul test is for GH #10259
|
|
a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'],
|
|
columns=['p', 'q', 'r', 's'])
|
|
b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'],
|
|
columns=['one', 'two'])
|
|
|
|
# DataFrame @ DataFrame
|
|
result = operator.matmul(a, b)
|
|
expected = DataFrame(np.dot(a.values, b.values),
|
|
index=['a', 'b', 'c'],
|
|
columns=['one', 'two'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# DataFrame @ Series
|
|
result = operator.matmul(a, b.one)
|
|
expected = Series(np.dot(a.values, b.one.values),
|
|
index=['a', 'b', 'c'])
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# np.array @ DataFrame
|
|
result = operator.matmul(a.values, b)
|
|
expected = np.dot(a.values, b.values)
|
|
tm.assert_almost_equal(result, expected)
|
|
|
|
# nested list @ DataFrame (__rmatmul__)
|
|
result = operator.matmul(a.values.tolist(), b)
|
|
expected = DataFrame(np.dot(a.values, b.values),
|
|
index=['a', 'b', 'c'],
|
|
columns=['one', 'two'])
|
|
tm.assert_almost_equal(result.values, expected.values)
|
|
|
|
# mixed dtype DataFrame @ DataFrame
|
|
a['q'] = a.q.round().astype(int)
|
|
result = operator.matmul(a, b)
|
|
expected = DataFrame(np.dot(a.values, b.values),
|
|
index=['a', 'b', 'c'],
|
|
columns=['one', 'two'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# different dtypes DataFrame @ DataFrame
|
|
a = a.astype(int)
|
|
result = operator.matmul(a, b)
|
|
expected = DataFrame(np.dot(a.values, b.values),
|
|
index=['a', 'b', 'c'],
|
|
columns=['one', 'two'])
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
# unaligned
|
|
df = DataFrame(randn(3, 4), index=[1, 2, 3], columns=lrange(4))
|
|
df2 = DataFrame(randn(5, 3), index=lrange(5), columns=[1, 2, 3])
|
|
|
|
with tm.assert_raises_regex(ValueError, 'aligned'):
|
|
operator.matmul(df, df2)
|
|
|
|
|
|
@pytest.fixture
|
|
def df_duplicates():
|
|
return pd.DataFrame({'a': [1, 2, 3, 4, 4],
|
|
'b': [1, 1, 1, 1, 1],
|
|
'c': [0, 1, 2, 5, 4]},
|
|
index=[0, 0, 1, 1, 1])
|
|
|
|
|
|
@pytest.fixture
|
|
def df_strings():
|
|
return pd.DataFrame({'a': np.random.permutation(10),
|
|
'b': list(ascii_lowercase[:10]),
|
|
'c': np.random.permutation(10).astype('float64')})
|
|
|
|
|
|
@pytest.fixture
|
|
def df_main_dtypes():
|
|
return pd.DataFrame(
|
|
{'group': [1, 1, 2],
|
|
'int': [1, 2, 3],
|
|
'float': [4., 5., 6.],
|
|
'string': list('abc'),
|
|
'category_string': pd.Series(list('abc')).astype('category'),
|
|
'category_int': [7, 8, 9],
|
|
'datetime': pd.date_range('20130101', periods=3),
|
|
'datetimetz': pd.date_range('20130101',
|
|
periods=3,
|
|
tz='US/Eastern'),
|
|
'timedelta': pd.timedelta_range('1 s', periods=3, freq='s')},
|
|
columns=['group', 'int', 'float', 'string',
|
|
'category_string', 'category_int',
|
|
'datetime', 'datetimetz',
|
|
'timedelta'])
|
|
|
|
|
|
class TestNLargestNSmallest(object):
|
|
|
|
dtype_error_msg_template = ("Column {column!r} has dtype {dtype}, cannot "
|
|
"use method {method!r} with this dtype")
|
|
|
|
# ----------------------------------------------------------------------
|
|
# Top / bottom
|
|
@pytest.mark.parametrize('order', [
|
|
['a'],
|
|
['c'],
|
|
['a', 'b'],
|
|
['a', 'c'],
|
|
['b', 'a'],
|
|
['b', 'c'],
|
|
['a', 'b', 'c'],
|
|
['c', 'a', 'b'],
|
|
['c', 'b', 'a'],
|
|
['b', 'c', 'a'],
|
|
['b', 'a', 'c'],
|
|
|
|
# dups!
|
|
['b', 'c', 'c']])
|
|
@pytest.mark.parametrize('n', range(1, 11))
|
|
def test_n(self, df_strings, nselect_method, n, order):
|
|
# GH10393
|
|
df = df_strings
|
|
if 'b' in order:
|
|
|
|
error_msg = self.dtype_error_msg_template.format(
|
|
column='b', method=nselect_method, dtype='object')
|
|
with tm.assert_raises_regex(TypeError, error_msg):
|
|
getattr(df, nselect_method)(n, order)
|
|
else:
|
|
ascending = nselect_method == 'nsmallest'
|
|
result = getattr(df, nselect_method)(n, order)
|
|
expected = df.sort_values(order, ascending=ascending).head(n)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize('columns', [
|
|
('group', 'category_string'), ('group', 'string')])
|
|
def test_n_error(self, df_main_dtypes, nselect_method, columns):
|
|
df = df_main_dtypes
|
|
col = columns[1]
|
|
error_msg = self.dtype_error_msg_template.format(
|
|
column=col, method=nselect_method, dtype=df[col].dtype)
|
|
# escape some characters that may be in the repr
|
|
error_msg = (error_msg.replace('(', '\\(').replace(")", "\\)")
|
|
.replace("[", "\\[").replace("]", "\\]"))
|
|
with tm.assert_raises_regex(TypeError, error_msg):
|
|
getattr(df, nselect_method)(2, columns)
|
|
|
|
def test_n_all_dtypes(self, df_main_dtypes):
|
|
df = df_main_dtypes
|
|
df.nsmallest(2, list(set(df) - {'category_string', 'string'}))
|
|
df.nlargest(2, list(set(df) - {'category_string', 'string'}))
|
|
|
|
def test_n_identical_values(self):
|
|
# GH15297
|
|
df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]})
|
|
|
|
result = df.nlargest(3, 'a')
|
|
expected = pd.DataFrame(
|
|
{'a': [1] * 3, 'b': [1, 2, 3]}, index=[0, 1, 2]
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.nsmallest(3, 'a')
|
|
expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]})
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
@pytest.mark.parametrize('order', [
|
|
['a', 'b', 'c'],
|
|
['c', 'b', 'a'],
|
|
['a'],
|
|
['b'],
|
|
['a', 'b'],
|
|
['c', 'b']])
|
|
@pytest.mark.parametrize('n', range(1, 6))
|
|
def test_n_duplicate_index(self, df_duplicates, n, order):
|
|
# GH 13412
|
|
|
|
df = df_duplicates
|
|
result = df.nsmallest(n, order)
|
|
expected = df.sort_values(order).head(n)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.nlargest(n, order)
|
|
expected = df.sort_values(order, ascending=False).head(n)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_series_broadcasting(self):
|
|
# smoke test for numpy warnings
|
|
# GH 16378, GH 16306
|
|
df = DataFrame([1.0, 1.0, 1.0])
|
|
df_nan = DataFrame({'A': [np.nan, 2.0, np.nan]})
|
|
s = Series([1, 1, 1])
|
|
s_nan = Series([np.nan, np.nan, 1])
|
|
|
|
with tm.assert_produces_warning(None):
|
|
df_nan.clip_lower(s, axis=0)
|
|
for op in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']:
|
|
getattr(df, op)(s_nan, axis=0)
|
|
|
|
def test_series_nat_conversion(self):
|
|
# GH 18521
|
|
# Check rank does not mutate DataFrame
|
|
df = DataFrame(np.random.randn(10, 3), dtype='float64')
|
|
expected = df.copy()
|
|
df.rank()
|
|
result = df
|
|
tm.assert_frame_equal(result, expected)
|