393 lines
16 KiB
Python
393 lines
16 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
from __future__ import print_function
|
|
|
|
|
|
import pytest
|
|
import numpy as np
|
|
|
|
from pandas import (DataFrame, Series, Timestamp, _np_version_under1p11)
|
|
import pandas as pd
|
|
|
|
from pandas.util.testing import assert_series_equal, assert_frame_equal
|
|
|
|
import pandas.util.testing as tm
|
|
|
|
from pandas.tests.frame.common import TestData
|
|
|
|
|
|
class TestDataFrameQuantile(TestData):
|
|
|
|
def test_quantile(self):
    """Check scalar and list-like quantiles of a DataFrame on both axes.

    Covers agreement with ``np.percentile`` on the inherited ``tsframe``
    fixture, an empty frame, a frame holding a non-numeric column, and
    explicit ``axis=1`` requests.
    """
    frame = self.tsframe

    # column-wise: one entry per column, equal to numpy's 10th percentile
    by_col = frame.quantile(0.1, axis=0)
    assert by_col['A'] == np.percentile(frame['A'], 10)
    tm.assert_index_equal(by_col.index, frame.columns)

    # row-wise: one entry per row label
    by_row = frame.quantile(0.9, axis=1)
    row_pct = np.percentile(frame.loc['2000-01-17'], 90)
    assert by_row['2000-01-17'] == row_pct
    tm.assert_index_equal(by_row.index, frame.index)

    # test degenerate case
    empty_q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
    assert np.isnan(empty_q['x'])
    assert np.isnan(empty_q['y'])

    # non-numeric exclusion
    mixed = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]})
    assert_series_equal(mixed.quantile(0.5), mixed.median().rename(0.5))

    # axis
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
    row_medians = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
    assert_series_equal(df.quantile(.5, axis=1), row_medians)

    # list-like q on axis=1: quantile levels become the result index
    row_quantiles = DataFrame({1: [1.5, 1.75],
                               2: [2.5, 2.75],
                               3: [3.5, 3.75]},
                              index=[0.5, 0.75])
    assert_frame_equal(df.quantile([.5, .75], axis=1), row_quantiles,
                       check_index_type=True)

    # We may want to break API in the future to change this
    # so that we exclude non-numeric along the same axis
    # See GH #7312
    df = DataFrame([[1, 2, 3],
                    ['a', 'b', 4]])
    numeric_only_rows = Series([3., 4.], index=[0, 1], name=0.5)
    assert_series_equal(df.quantile(.5, axis=1), numeric_only_rows)
|
|
|
|
def test_quantile_axis_mixed(self):
    """Quantile along axis=1 for a frame of mixed column dtypes.

    With default arguments the expected row medians are those of the
    numeric columns ``A`` and ``B``; asking for ``numeric_only=False``
    is expected to raise ``TypeError``.
    """
    # mixed on axis=1
    columns = {"A": [1, 2, 3],
               "B": [2., 3., 4.],
               "C": pd.date_range('20130101', periods=3),
               "D": ['foo', 'bar', 'baz']}
    df = DataFrame(columns)

    medians = df.quantile(.5, axis=1)
    assert_series_equal(medians, Series([1.5, 2.5, 3.5], name=0.5))

    # must raise
    with pytest.raises(TypeError):
        df.quantile(.5, axis=1, numeric_only=False)
|
|
|
|
def test_quantile_axis_parameter(self):
    """``axis`` accepts 0/"index" and 1/"columns"; other values raise."""
    # GH 9543/9544
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])

    by_index = df.quantile(.5, axis=0)
    assert_series_equal(by_index,
                        Series([2., 3.], index=["A", "B"], name=0.5))
    # the string alias must agree with the integer form
    assert_series_equal(by_index, df.quantile(.5, axis="index"))

    row_medians = Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5)
    assert_series_equal(df.quantile(.5, axis=1), row_medians)
    assert_series_equal(df.quantile(.5, axis="columns"), row_medians)

    # neither an out-of-range integer nor a misspelled alias is accepted
    for bad_axis in (-1, "column"):
        with pytest.raises(ValueError):
            df.quantile(0.1, axis=bad_axis)
|
|
|
|
def test_quantile_interpolation(self):
    """Exercise the ``interpolation`` keyword of ``DataFrame.quantile``.

    Checks 'linear' (explicit and default) against ``np.percentile``,
    then 'nearest', 'lower', 'higher' and 'midpoint', including the
    dtype of the 'nearest' result for int and float input.
    """
    # see gh-10174

    # interpolation = linear (default case)
    q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
    assert q['A'] == np.percentile(self.tsframe['A'], 10)
    q = self.intframe.quantile(0.1)
    assert q['A'] == np.percentile(self.intframe['A'], 10)

    # test with and without interpolation keyword: `q` above omits the
    # keyword, so `q1` passes it explicitly to make the comparison real
    q1 = self.intframe.quantile(0.1, interpolation='linear')
    assert q1['A'] == np.percentile(self.intframe['A'], 10)
    tm.assert_series_equal(q, q1)

    # interpolation method other than default linear
    df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
    result = df.quantile(.5, axis=1, interpolation='nearest')
    expected = Series([1, 2, 3], index=[1, 2, 3], name=0.5)
    tm.assert_series_equal(result, expected)

    # cross-check interpolation=nearest results in original dtype
    # (np.percentile takes q in [0, 100], so the median is 50, not .5)
    exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), 50,
                        axis=0, interpolation='nearest')
    expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='int64')
    tm.assert_series_equal(result, expected)

    # float
    df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3])
    result = df.quantile(.5, axis=1, interpolation='nearest')
    expected = Series([1., 2., 3.], index=[1, 2, 3], name=0.5)
    tm.assert_series_equal(result, expected)
    exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), 50,
                        axis=0, interpolation='nearest')
    expected = Series(exp, index=[1, 2, 3], name=0.5, dtype='float64')
    assert_series_equal(result, expected)

    # axis
    result = df.quantile([.5, .75], axis=1, interpolation='lower')
    expected = DataFrame({1: [1., 1.], 2: [2., 2.],
                          3: [3., 3.]}, index=[0.5, 0.75])
    assert_frame_equal(result, expected)

    # test degenerate case
    df = DataFrame({'x': [], 'y': []})
    q = df.quantile(0.1, axis=0, interpolation='higher')
    assert np.isnan(q['x']) and np.isnan(q['y'])

    # multi
    df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                   columns=['a', 'b', 'c'])
    result = df.quantile([.25, .5], interpolation='midpoint')

    # https://github.com/numpy/numpy/issues/7163
    # the expected median differs by numpy version, per the issue above
    if _np_version_under1p11:
        expected = DataFrame([[1.5, 1.5, 1.5], [2.5, 2.5, 2.5]],
                             index=[.25, .5], columns=['a', 'b', 'c'])
    else:
        expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]],
                             index=[.25, .5], columns=['a', 'b', 'c'])
    assert_frame_equal(result, expected)
|
|
|
|
def test_quantile_multi(self):
|
|
df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
|
|
columns=['a', 'b', 'c'])
|
|
result = df.quantile([.25, .5])
|
|
expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
|
|
index=[.25, .5], columns=['a', 'b', 'c'])
|
|
assert_frame_equal(result, expected)
|
|
|
|
# axis = 1
|
|
result = df.quantile([.25, .5], axis=1)
|
|
expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]],
|
|
index=[.25, .5], columns=[0, 1, 2])
|
|
|
|
# empty
|
|
result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0)
|
|
expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]},
|
|
index=[.1, .9])
|
|
assert_frame_equal(result, expected)
|
|
|
|
def test_quantile_datetime(self):
|
|
df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]})
|
|
|
|
# exclude datetime
|
|
result = df.quantile(.5)
|
|
expected = Series([2.5], index=['b'])
|
|
|
|
# datetime
|
|
result = df.quantile(.5, numeric_only=False)
|
|
expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5],
|
|
index=['a', 'b'],
|
|
name=0.5)
|
|
assert_series_equal(result, expected)
|
|
|
|
# datetime w/ multi
|
|
result = df.quantile([.5], numeric_only=False)
|
|
expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]],
|
|
index=[.5], columns=['a', 'b'])
|
|
assert_frame_equal(result, expected)
|
|
|
|
# axis = 1
|
|
df['c'] = pd.to_datetime(['2011', '2012'])
|
|
result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False)
|
|
expected = Series([Timestamp('2010-07-02 12:00:00'),
|
|
Timestamp('2011-07-02 12:00:00')],
|
|
index=[0, 1],
|
|
name=0.5)
|
|
assert_series_equal(result, expected)
|
|
|
|
result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False)
|
|
expected = DataFrame([[Timestamp('2010-07-02 12:00:00'),
|
|
Timestamp('2011-07-02 12:00:00')]],
|
|
index=[0.5], columns=[0, 1])
|
|
assert_frame_equal(result, expected)
|
|
|
|
# empty when numeric_only=True
|
|
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
|
# result = df[['a', 'c']].quantile(.5)
|
|
# result = df[['a', 'c']].quantile([.5])
|
|
|
|
def test_quantile_invalid(self):
    """Quantile levels outside [0, 1] raise ``ValueError``."""
    msg = 'percentiles should all be in the interval \\[0, 1\\]'
    # scalars and lists alike; one bad entry in a list is enough
    out_of_range = (-1, 2, [0.5, -1], [0.5, 2])
    for q in out_of_range:
        with tm.assert_raises_regex(ValueError, msg):
            self.tsframe.quantile(q)
|
|
|
|
def test_quantile_box(self):
|
|
df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
|
|
pd.Timestamp('2011-01-02'),
|
|
pd.Timestamp('2011-01-03')],
|
|
'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
|
|
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
|
pd.Timestamp('2011-01-03', tz='US/Eastern')],
|
|
'C': [pd.Timedelta('1 days'),
|
|
pd.Timedelta('2 days'),
|
|
pd.Timedelta('3 days')]})
|
|
|
|
res = df.quantile(0.5, numeric_only=False)
|
|
|
|
exp = pd.Series([pd.Timestamp('2011-01-02'),
|
|
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
|
pd.Timedelta('2 days')],
|
|
name=0.5, index=['A', 'B', 'C'])
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5], numeric_only=False)
|
|
exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
|
|
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
|
pd.Timedelta('2 days')]],
|
|
index=[0.5], columns=['A', 'B', 'C'])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
# DatetimeBlock may be consolidated and contain NaT in different loc
|
|
df = DataFrame({'A': [pd.Timestamp('2011-01-01'),
|
|
pd.NaT,
|
|
pd.Timestamp('2011-01-02'),
|
|
pd.Timestamp('2011-01-03')],
|
|
'a': [pd.Timestamp('2011-01-01'),
|
|
pd.Timestamp('2011-01-02'),
|
|
pd.NaT,
|
|
pd.Timestamp('2011-01-03')],
|
|
'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
|
|
pd.NaT,
|
|
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
|
pd.Timestamp('2011-01-03', tz='US/Eastern')],
|
|
'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'),
|
|
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
|
pd.NaT,
|
|
pd.Timestamp('2011-01-03', tz='US/Eastern')],
|
|
'C': [pd.Timedelta('1 days'),
|
|
pd.Timedelta('2 days'),
|
|
pd.Timedelta('3 days'),
|
|
pd.NaT],
|
|
'c': [pd.NaT,
|
|
pd.Timedelta('1 days'),
|
|
pd.Timedelta('2 days'),
|
|
pd.Timedelta('3 days')]},
|
|
columns=list('AaBbCc'))
|
|
|
|
res = df.quantile(0.5, numeric_only=False)
|
|
exp = pd.Series([pd.Timestamp('2011-01-02'),
|
|
pd.Timestamp('2011-01-02'),
|
|
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
|
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
|
pd.Timedelta('2 days'),
|
|
pd.Timedelta('2 days')],
|
|
name=0.5, index=list('AaBbCc'))
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5], numeric_only=False)
|
|
exp = pd.DataFrame([[pd.Timestamp('2011-01-02'),
|
|
pd.Timestamp('2011-01-02'),
|
|
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
|
pd.Timestamp('2011-01-02', tz='US/Eastern'),
|
|
pd.Timedelta('2 days'),
|
|
pd.Timedelta('2 days')]],
|
|
index=[0.5], columns=list('AaBbCc'))
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_quantile_nan(self):
|
|
|
|
# GH 14357 - float block where some cols have missing values
|
|
df = DataFrame({'a': np.arange(1, 6.0), 'b': np.arange(1, 6.0)})
|
|
df.iloc[-1, 1] = np.nan
|
|
|
|
res = df.quantile(0.5)
|
|
exp = Series([3.0, 2.5], index=['a', 'b'], name=0.5)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5, 0.75])
|
|
exp = DataFrame({'a': [3.0, 4.0], 'b': [2.5, 3.25]}, index=[0.5, 0.75])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
res = df.quantile(0.5, axis=1)
|
|
exp = Series(np.arange(1.0, 6.0), name=0.5)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5, 0.75], axis=1)
|
|
exp = DataFrame([np.arange(1.0, 6.0)] * 2, index=[0.5, 0.75])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
# full-nan column
|
|
df['b'] = np.nan
|
|
|
|
res = df.quantile(0.5)
|
|
exp = Series([3.0, np.nan], index=['a', 'b'], name=0.5)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5, 0.75])
|
|
exp = DataFrame({'a': [3.0, 4.0], 'b': [np.nan, np.nan]},
|
|
index=[0.5, 0.75])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_quantile_nat(self):
|
|
|
|
# full NaT column
|
|
df = DataFrame({'a': [pd.NaT, pd.NaT, pd.NaT]})
|
|
|
|
res = df.quantile(0.5, numeric_only=False)
|
|
exp = Series([pd.NaT], index=['a'], name=0.5)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5], numeric_only=False)
|
|
exp = DataFrame({'a': [pd.NaT]}, index=[0.5])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
# mixed non-null / full null column
|
|
df = DataFrame({'a': [pd.Timestamp('2012-01-01'),
|
|
pd.Timestamp('2012-01-02'),
|
|
pd.Timestamp('2012-01-03')],
|
|
'b': [pd.NaT, pd.NaT, pd.NaT]})
|
|
|
|
res = df.quantile(0.5, numeric_only=False)
|
|
exp = Series([pd.Timestamp('2012-01-02'), pd.NaT], index=['a', 'b'],
|
|
name=0.5)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5], numeric_only=False)
|
|
exp = DataFrame([[pd.Timestamp('2012-01-02'), pd.NaT]], index=[0.5],
|
|
columns=['a', 'b'])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
def test_quantile_empty(self):
|
|
|
|
# floats
|
|
df = DataFrame(columns=['a', 'b'], dtype='float64')
|
|
|
|
res = df.quantile(0.5)
|
|
exp = Series([np.nan, np.nan], index=['a', 'b'], name=0.5)
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
res = df.quantile([0.5])
|
|
exp = DataFrame([[np.nan, np.nan]], columns=['a', 'b'], index=[0.5])
|
|
tm.assert_frame_equal(res, exp)
|
|
|
|
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
|
# res = df.quantile(0.5, axis=1)
|
|
# res = df.quantile([0.5], axis=1)
|
|
|
|
# ints
|
|
df = DataFrame(columns=['a', 'b'], dtype='int64')
|
|
|
|
# FIXME (gives empty frame in 0.18.1, broken in 0.19.0)
|
|
# res = df.quantile(0.5)
|
|
|
|
# datetimes
|
|
df = DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
|
|
|
|
# FIXME (gives NaNs instead of NaT in 0.18.1 or 0.19.0)
|
|
# res = df.quantile(0.5, numeric_only=False)
|