# laywerrobot/lib/python3.6/site-packages/pandas/tests/reshape/test_tile.py
# 2020-08-27 21:55:39 +02:00
#
# 607 lines
# 22 KiB
# Python

import os
import pytest
import numpy as np
from pandas.compat import zip
import pandas as pd
from pandas import (DataFrame, Series, isna, to_datetime, DatetimeIndex, Index,
Timestamp, Interval, IntervalIndex, Categorical,
cut, qcut, date_range, NaT, TimedeltaIndex)
from pandas.tseries.offsets import Nano, Day
import pandas.util.testing as tm
from pandas.api.types import CategoricalDtype as CDT
from pandas.core.algorithms import quantile
import pandas.core.reshape.tile as tmod
class TestCut(object):
def test_simple(self):
data = np.ones(5, dtype='int64')
result = cut(data, 4, labels=False)
expected = np.array([1, 1, 1, 1, 1])
tm.assert_numpy_array_equal(result, expected,
check_dtype=False)
def test_bins(self):
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1])
result, bins = cut(data, 3, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3))
intervals = intervals.take([0, 0, 0, 1, 2, 0])
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
6.53333333, 9.7]))
def test_right(self):
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
result, bins = cut(data, 4, right=True, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3))
expected = Categorical(intervals, ordered=True)
expected = expected.take([0, 0, 0, 2, 3, 0, 0])
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95,
7.325, 9.7]))
def test_noright(self):
data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
result, bins = cut(data, 4, right=False, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3), closed='left')
intervals = intervals.take([0, 0, 0, 2, 3, 0, 1])
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95,
7.325, 9.7095]))
def test_arraylike(self):
data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
result, bins = cut(data, 3, retbins=True)
intervals = IntervalIndex.from_breaks(bins.round(3))
intervals = intervals.take([0, 0, 0, 1, 2, 0])
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667,
6.53333333, 9.7]))
def test_bins_from_intervalindex(self):
c = cut(range(5), 3)
expected = c
result = cut(range(5), bins=expected.categories)
tm.assert_categorical_equal(result, expected)
expected = Categorical.from_codes(np.append(c.codes, -1),
categories=c.categories,
ordered=True)
result = cut(range(6), bins=expected.categories)
tm.assert_categorical_equal(result, expected)
# doc example
# make sure we preserve the bins
ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60])
c = cut(ages, bins=[0, 18, 35, 70])
expected = IntervalIndex.from_tuples([(0, 18), (18, 35), (35, 70)])
tm.assert_index_equal(c.categories, expected)
result = cut([25, 20, 50], bins=c.categories)
tm.assert_index_equal(result.categories, expected)
tm.assert_numpy_array_equal(result.codes,
np.array([1, 1, 2], dtype='int8'))
def test_bins_not_monotonic(self):
data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
pytest.raises(ValueError, cut, data, [0.1, 1.5, 1, 10])
def test_wrong_num_labels(self):
data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
pytest.raises(ValueError, cut, data, [0, 1, 10],
labels=['foo', 'bar', 'baz'])
def test_cut_corner(self):
# h3h
pytest.raises(ValueError, cut, [], 2)
pytest.raises(ValueError, cut, [1, 2, 3], 0.5)
@pytest.mark.parametrize('arg', [2, np.eye(2), DataFrame(np.eye(2))])
@pytest.mark.parametrize('cut_func', [cut, qcut])
def test_cut_not_1d_arg(self, arg, cut_func):
with pytest.raises(ValueError):
cut_func(arg, 2)
def test_cut_out_of_range_more(self):
# #1511
s = Series([0, -1, 0, 1, -3], name='x')
ind = cut(s, [0, 1], labels=False)
exp = Series([np.nan, np.nan, np.nan, 0, np.nan], name='x')
tm.assert_series_equal(ind, exp)
def test_labels(self):
arr = np.tile(np.arange(0, 1.01, 0.1), 4)
result, bins = cut(arr, 4, retbins=True)
ex_levels = IntervalIndex.from_breaks([-1e-3, 0.25, 0.5, 0.75, 1])
tm.assert_index_equal(result.categories, ex_levels)
result, bins = cut(arr, 4, retbins=True, right=False)
ex_levels = IntervalIndex.from_breaks([0, 0.25, 0.5, 0.75, 1 + 1e-3],
closed='left')
tm.assert_index_equal(result.categories, ex_levels)
def test_cut_pass_series_name_to_factor(self):
s = Series(np.random.randn(100), name='foo')
factor = cut(s, 4)
assert factor.name == 'foo'
def test_label_precision(self):
arr = np.arange(0, 0.73, 0.01)
result = cut(arr, 4, precision=2)
ex_levels = IntervalIndex.from_breaks([-0.00072, 0.18, 0.36,
0.54, 0.72])
tm.assert_index_equal(result.categories, ex_levels)
def test_na_handling(self):
arr = np.arange(0, 0.75, 0.01)
arr[::3] = np.nan
result = cut(arr, 4)
result_arr = np.asarray(result)
ex_arr = np.where(isna(arr), np.nan, result_arr)
tm.assert_almost_equal(result_arr, ex_arr)
result = cut(arr, 4, labels=False)
ex_result = np.where(isna(arr), np.nan, result)
tm.assert_almost_equal(result, ex_result)
def test_inf_handling(self):
data = np.arange(6)
data_ser = Series(data, dtype='int64')
bins = [-np.inf, 2, 4, np.inf]
result = cut(data, bins)
result_ser = cut(data_ser, bins)
ex_uniques = IntervalIndex.from_breaks(bins)
tm.assert_index_equal(result.categories, ex_uniques)
assert result[5] == Interval(4, np.inf)
assert result[0] == Interval(-np.inf, 2)
assert result_ser[5] == Interval(4, np.inf)
assert result_ser[0] == Interval(-np.inf, 2)
def test_qcut(self):
arr = np.random.randn(1000)
# We store the bins as Index that have been rounded
# to comparisons are a bit tricky.
labels, bins = qcut(arr, 4, retbins=True)
ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
result = labels.categories.left.values
assert np.allclose(result, ex_bins[:-1], atol=1e-2)
result = labels.categories.right.values
assert np.allclose(result, ex_bins[1:], atol=1e-2)
ex_levels = cut(arr, ex_bins, include_lowest=True)
tm.assert_categorical_equal(labels, ex_levels)
def test_qcut_bounds(self):
arr = np.random.randn(1000)
factor = qcut(arr, 10, labels=False)
assert len(np.unique(factor)) == 10
def test_qcut_specify_quantiles(self):
arr = np.random.randn(100)
factor = qcut(arr, [0, .25, .5, .75, 1.])
expected = qcut(arr, 4)
tm.assert_categorical_equal(factor, expected)
def test_qcut_all_bins_same(self):
tm.assert_raises_regex(ValueError, "edges.*unique", qcut,
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 3)
def test_cut_out_of_bounds(self):
arr = np.random.randn(100)
result = cut(arr, [-1, 0, 1])
mask = isna(result)
ex_mask = (arr < -1) | (arr > 1)
tm.assert_numpy_array_equal(mask, ex_mask)
def test_cut_pass_labels(self):
arr = [50, 5, 10, 15, 20, 30, 70]
bins = [0, 25, 50, 100]
labels = ['Small', 'Medium', 'Large']
result = cut(arr, bins, labels=labels)
exp = Categorical(['Medium'] + 4 * ['Small'] + ['Medium', 'Large'],
categories=labels,
ordered=True)
tm.assert_categorical_equal(result, exp)
result = cut(arr, bins, labels=Categorical.from_codes([0, 1, 2],
labels))
exp = Categorical.from_codes([1] + 4 * [0] + [1, 2], labels)
tm.assert_categorical_equal(result, exp)
# issue 16459
labels = ['Good', 'Medium', 'Bad']
result = cut(arr, 3, labels=labels)
exp = cut(arr, 3, labels=Categorical(labels, categories=labels,
ordered=True))
tm.assert_categorical_equal(result, exp)
def test_qcut_include_lowest(self):
values = np.arange(10)
ii = qcut(values, 4)
ex_levels = IntervalIndex(
[Interval(-0.001, 2.25),
Interval(2.25, 4.5),
Interval(4.5, 6.75),
Interval(6.75, 9)])
tm.assert_index_equal(ii.categories, ex_levels)
def test_qcut_nas(self):
arr = np.random.randn(100)
arr[:20] = np.nan
result = qcut(arr, 4)
assert isna(result[:20]).all()
def test_qcut_index(self):
result = qcut([0, 2], 2)
intervals = [Interval(-0.001, 1), Interval(1, 2)]
expected = Categorical(intervals, ordered=True)
tm.assert_categorical_equal(result, expected)
def test_round_frac(self):
# it works
result = cut(np.arange(11.), 2)
result = cut(np.arange(11.) / 1e10, 2)
# #1979, negative numbers
result = tmod._round_frac(-117.9998, precision=3)
assert result == -118
result = tmod._round_frac(117.9998, precision=3)
assert result == 118
result = tmod._round_frac(117.9998, precision=2)
assert result == 118
result = tmod._round_frac(0.000123456, precision=2)
assert result == 0.00012
def test_qcut_binning_issues(self, datapath):
# #1978, 1979
cut_file = datapath(os.path.join('reshape', 'data', 'cut_data.csv'))
arr = np.loadtxt(cut_file)
result = qcut(arr, 20)
starts = []
ends = []
for lev in np.unique(result):
s = lev.left
e = lev.right
assert s != e
starts.append(float(s))
ends.append(float(e))
for (sp, sn), (ep, en) in zip(zip(starts[:-1], starts[1:]),
zip(ends[:-1], ends[1:])):
assert sp < sn
assert ep < en
assert ep <= sn
def test_cut_return_intervals(self):
s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
res = cut(s, 3)
exp_bins = np.linspace(0, 8, num=4).round(3)
exp_bins[0] -= 0.008
exp = Series(IntervalIndex.from_breaks(exp_bins, closed='right').take(
[0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(CDT(ordered=True))
tm.assert_series_equal(res, exp)
def test_qcut_return_intervals(self):
s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])
res = qcut(s, [0, 0.333, 0.666, 1])
exp_levels = np.array([Interval(-0.001, 2.664),
Interval(2.664, 5.328), Interval(5.328, 8)])
exp = Series(exp_levels.take([0, 0, 0, 1, 1, 1, 2, 2, 2])).astype(
CDT(ordered=True))
tm.assert_series_equal(res, exp)
def test_series_retbins(self):
# GH 8589
s = Series(np.arange(4))
result, bins = cut(s, 2, retbins=True)
expected = Series(IntervalIndex.from_breaks(
[-0.003, 1.5, 3], closed='right').repeat(2)).astype(
CDT(ordered=True))
tm.assert_series_equal(result, expected)
result, bins = qcut(s, 2, retbins=True)
expected = Series(IntervalIndex.from_breaks(
[-0.001, 1.5, 3], closed='right').repeat(2)).astype(
CDT(ordered=True))
tm.assert_series_equal(result, expected)
def test_cut_duplicates_bin(self):
# issue 20947
values = Series(np.array([1, 3, 5, 7, 9]),
index=["a", "b", "c", "d", "e"])
bins = [0, 2, 4, 6, 10, 10]
result = cut(values, bins, duplicates='drop')
expected = cut(values, pd.unique(bins))
tm.assert_series_equal(result, expected)
pytest.raises(ValueError, cut, values, bins)
pytest.raises(ValueError, cut, values, bins, duplicates='raise')
# invalid
pytest.raises(ValueError, cut, values, bins, duplicates='foo')
def test_qcut_duplicates_bin(self):
# GH 7751
values = [0, 0, 0, 0, 1, 2, 3]
expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])
result = qcut(values, 3, duplicates='drop')
tm.assert_index_equal(result.categories, expected)
pytest.raises(ValueError, qcut, values, 3)
pytest.raises(ValueError, qcut, values, 3, duplicates='raise')
# invalid
pytest.raises(ValueError, qcut, values, 3, duplicates='foo')
def test_single_quantile(self):
# issue 15431
expected = Series([0, 0])
s = Series([9., 9.])
result = qcut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
result = qcut(s, 1)
intervals = IntervalIndex([Interval(8.999, 9.0),
Interval(8.999, 9.0)], closed='right')
expected = Series(intervals).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
s = Series([-9., -9.])
expected = Series([0, 0])
result = qcut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
result = qcut(s, 1)
intervals = IntervalIndex([Interval(-9.001, -9.0),
Interval(-9.001, -9.0)], closed='right')
expected = Series(intervals).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
s = Series([0., 0.])
expected = Series([0, 0])
result = qcut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
result = qcut(s, 1)
intervals = IntervalIndex([Interval(-0.001, 0.0),
Interval(-0.001, 0.0)], closed='right')
expected = Series(intervals).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
s = Series([9])
expected = Series([0])
result = qcut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
result = qcut(s, 1)
intervals = IntervalIndex([Interval(8.999, 9.0)], closed='right')
expected = Series(intervals).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
s = Series([-9])
expected = Series([0])
result = qcut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
result = qcut(s, 1)
intervals = IntervalIndex([Interval(-9.001, -9.0)], closed='right')
expected = Series(intervals).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
s = Series([0])
expected = Series([0])
result = qcut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
result = qcut(s, 1)
intervals = IntervalIndex([Interval(-0.001, 0.0)], closed='right')
expected = Series(intervals).astype(CDT(ordered=True))
tm.assert_series_equal(result, expected)
def test_single_bin(self):
# issue 14652
expected = Series([0, 0])
s = Series([9., 9.])
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
s = Series([-9., -9.])
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
expected = Series([0])
s = Series([9])
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
s = Series([-9])
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
# issue 15428
expected = Series([0, 0])
s = Series([0., 0.])
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
expected = Series([0])
s = Series([0])
result = cut(s, 1, labels=False)
tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
"array_1_writeable, array_2_writeable",
[(True, True), (True, False), (False, False)])
def test_cut_read_only(self, array_1_writeable, array_2_writeable):
# issue 18773
array_1 = np.arange(0, 100, 10)
array_1.flags.writeable = array_1_writeable
array_2 = np.arange(0, 100, 10)
array_2.flags.writeable = array_2_writeable
hundred_elements = np.arange(100)
tm.assert_categorical_equal(cut(hundred_elements, array_1),
cut(hundred_elements, array_2))
class TestDatelike(object):
    """Tests of ``cut`` and ``qcut`` on datetime and timedelta data."""

    @pytest.mark.parametrize('s', [
        Series(DatetimeIndex(['20180101', NaT, '20180103'])),
        Series(TimedeltaIndex(['0 days', NaT, '2 days']))],
        ids=lambda x: str(x.dtype))
    def test_qcut_nat(self, s):
        """``qcut`` keeps a NaT element as a missing entry in the result."""
        # GH 19768
        intervals = IntervalIndex.from_tuples(
            [(s[0] - Nano(), s[2] - Day()), np.nan, (s[2] - Day(), s[2])])
        expected = Series(Categorical(intervals, ordered=True))

        result = qcut(s, 2)
        tm.assert_series_equal(result, expected)

    def test_datetime_cut(self):
        """``cut`` gives the same bins for several datetime containers."""
        # GH 14714
        expected = (
            Series(IntervalIndex([
                Interval(Timestamp('2012-12-31 23:57:07.200000'),
                         Timestamp('2013-01-01 16:00:00')),
                Interval(Timestamp('2013-01-01 16:00:00'),
                         Timestamp('2013-01-02 08:00:00')),
                Interval(Timestamp('2013-01-02 08:00:00'),
                         Timestamp('2013-01-03 00:00:00'))]))
            .astype(CDT(ordered=True)))

        # testing for time data to be present as series
        data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03']))
        result, _ = cut(data, 3, retbins=True)
        tm.assert_series_equal(result, expected)

        # testing for time data to be present as list, as ndarray and as
        # datetime index; the result is wrapped in a Series for comparison
        stamps = [np.datetime64('2013-01-01'), np.datetime64('2013-01-02'),
                  np.datetime64('2013-01-03')]
        containers = [
            stamps,
            np.array(stamps),
            DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03']),
        ]
        for data in containers:
            result, _ = cut(data, 3, retbins=True)
            tm.assert_series_equal(Series(result), expected)

    @pytest.mark.parametrize('bins', [
        3, [Timestamp('2013-01-01 04:57:07.200000'),
            Timestamp('2013-01-01 21:00:00'),
            Timestamp('2013-01-02 13:00:00'),
            Timestamp('2013-01-03 05:00:00')]])
    @pytest.mark.parametrize('box', [list, np.array, Index, Series])
    def test_datetimetz_cut(self, bins, box):
        """``cut`` on tz-aware data with an int or boxed Timestamp bins."""
        # GH 19872
        tz = 'US/Eastern'
        s = Series(date_range('20130101', periods=3, tz=tz))

        # an integer bin count is passed through; explicit edges are boxed
        if not isinstance(bins, int):
            bins = box(bins)

        result = cut(s, bins)
        expected = (
            Series(IntervalIndex([
                Interval(Timestamp('2012-12-31 23:57:07.200000', tz=tz),
                         Timestamp('2013-01-01 16:00:00', tz=tz)),
                Interval(Timestamp('2013-01-01 16:00:00', tz=tz),
                         Timestamp('2013-01-02 08:00:00', tz=tz)),
                Interval(Timestamp('2013-01-02 08:00:00', tz=tz),
                         Timestamp('2013-01-03 00:00:00', tz=tz))]))
            .astype(CDT(ordered=True)))
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize('bins', [3, np.linspace(0, 1, 4)])
    def test_datetimetz_qcut(self, bins):
        """``qcut`` on tz-aware data with a count or explicit quantiles."""
        # GH 19872
        tz = 'US/Eastern'
        s = Series(date_range('20130101', periods=3, tz=tz))

        result = qcut(s, bins)
        expected = (
            Series(IntervalIndex([
                Interval(Timestamp('2012-12-31 23:59:59.999999999', tz=tz),
                         Timestamp('2013-01-01 16:00:00', tz=tz)),
                Interval(Timestamp('2013-01-01 16:00:00', tz=tz),
                         Timestamp('2013-01-02 08:00:00', tz=tz)),
                Interval(Timestamp('2013-01-02 08:00:00', tz=tz),
                         Timestamp('2013-01-03 00:00:00', tz=tz))]))
            .astype(CDT(ordered=True)))
        tm.assert_series_equal(result, expected)

    def test_datetime_bin(self):
        """``cut`` accepts bin edges given as several datetime-like types."""
        data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')]
        bin_data = ['2012-12-12', '2012-12-14', '2012-12-16']
        expected = (
            Series(IntervalIndex([
                Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])),
                Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))]))
            .astype(CDT(ordered=True)))

        # edges as lists of Timestamp / np.datetime64
        for conv in [Timestamp, np.datetime64]:
            bins = [conv(v) for v in bin_data]
            result = cut(data, bins=bins)
            tm.assert_series_equal(Series(result), expected)

        # edges as a list of datetime.datetime
        bin_pydatetime = [Timestamp(v).to_pydatetime() for v in bin_data]
        result = cut(data, bins=bin_pydatetime)
        tm.assert_series_equal(Series(result), expected)

        # edges as the object returned by to_datetime; this case used to
        # pass ``bin_pydatetime`` again, so these bins were never tested
        bins = to_datetime(bin_data)
        result = cut(data, bins=bins)
        tm.assert_series_equal(Series(result), expected)

    def test_datetime_nan(self):
        """Numeric bins on datetimes raise; out-of-range values are NA."""
        with pytest.raises(ValueError):
            cut(date_range('20130101', periods=3), bins=[0, 2, 4])

        result = cut(date_range('20130102', periods=5),
                     bins=date_range('20130101', periods=2))

        # the single category itself is not missing
        mask = result.categories.isna()
        tm.assert_numpy_array_equal(mask, np.array([False]))

        # only the first value is expected to be binned; the rest are NA
        mask = result.isna()
        tm.assert_numpy_array_equal(
            mask, np.array([False, True, True, True, True]))