laywerrobot/lib/python3.6/site-packages/pandas/tests/test_expressions.py

445 lines
17 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
# -*- coding: utf-8 -*-
from __future__ import print_function
# pylint: disable-msg=W0612,E1101
from warnings import catch_warnings
import re
import operator
import pytest
from numpy.random import randn
import numpy as np
from pandas.core.api import DataFrame, Panel
from pandas.core.computation import expressions as expr
from pandas import compat, _np_version_under1p11, _np_version_under1p13
from pandas.util.testing import (assert_almost_equal, assert_series_equal,
assert_frame_equal, assert_panel_equal)
from pandas.io.formats.printing import pprint_thing
import pandas.util.testing as tm
_frame = DataFrame(randn(10000, 4), columns=list('ABCD'), dtype='float64')
_frame2 = DataFrame(randn(100, 4), columns=list('ABCD'), dtype='float64')
_mixed = DataFrame({'A': _frame['A'].copy(),
'B': _frame['B'].astype('float32'),
'C': _frame['C'].astype('int64'),
'D': _frame['D'].astype('int32')})
_mixed2 = DataFrame({'A': _frame2['A'].copy(),
'B': _frame2['B'].astype('float32'),
'C': _frame2['C'].astype('int64'),
'D': _frame2['D'].astype('int32')})
_integer = DataFrame(
np.random.randint(1, 100,
size=(10001, 4)),
columns=list('ABCD'), dtype='int64')
_integer2 = DataFrame(np.random.randint(1, 100, size=(101, 4)),
columns=list('ABCD'), dtype='int64')
with catch_warnings(record=True):
_frame_panel = Panel(dict(ItemA=_frame.copy(),
ItemB=(_frame.copy() + 3),
ItemC=_frame.copy(),
ItemD=_frame.copy()))
_frame2_panel = Panel(dict(ItemA=_frame2.copy(),
ItemB=(_frame2.copy() + 3),
ItemC=_frame2.copy(),
ItemD=_frame2.copy()))
_integer_panel = Panel(dict(ItemA=_integer,
ItemB=(_integer + 34).astype('int64')))
_integer2_panel = Panel(dict(ItemA=_integer2,
ItemB=(_integer2 + 34).astype('int64')))
_mixed_panel = Panel(dict(ItemA=_mixed, ItemB=(_mixed + 3)))
_mixed2_panel = Panel(dict(ItemA=_mixed2, ItemB=(_mixed2 + 3)))
@pytest.mark.skipif(not expr._USE_NUMEXPR, reason='not using numexpr')
class TestExpressions(object):
def setup_method(self, method):
self.frame = _frame.copy()
self.frame2 = _frame2.copy()
self.mixed = _mixed.copy()
self.mixed2 = _mixed2.copy()
self.integer = _integer.copy()
self._MIN_ELEMENTS = expr._MIN_ELEMENTS
def teardown_method(self, method):
expr._MIN_ELEMENTS = self._MIN_ELEMENTS
def run_arithmetic(self, df, other, assert_func, check_dtype=False,
test_flex=True):
expr._MIN_ELEMENTS = 0
operations = ['add', 'sub', 'mul', 'mod', 'truediv', 'floordiv']
if not compat.PY3:
operations.append('div')
for arith in operations:
operator_name = arith
if arith == 'div':
operator_name = 'truediv'
if test_flex:
op = lambda x, y: getattr(df, arith)(y)
op.__name__ = arith
else:
op = getattr(operator, operator_name)
expr.set_use_numexpr(False)
expected = op(df, other)
expr.set_use_numexpr(True)
result = op(df, other)
try:
if check_dtype:
if arith == 'truediv':
assert expected.dtype.kind == 'f'
assert_func(expected, result)
except Exception:
pprint_thing("Failed test with operator %r" % op.__name__)
raise
def test_integer_arithmetic(self):
self.run_arithmetic(self.integer, self.integer,
assert_frame_equal)
self.run_arithmetic(self.integer.iloc[:, 0],
self.integer.iloc[:, 0], assert_series_equal,
check_dtype=True)
def run_binary(self, df, other, assert_func, test_flex=False,
numexpr_ops=set(['gt', 'lt', 'ge', 'le', 'eq', 'ne'])):
"""
tests solely that the result is the same whether or not numexpr is
enabled. Need to test whether the function does the correct thing
elsewhere.
"""
expr._MIN_ELEMENTS = 0
expr.set_test_mode(True)
operations = ['gt', 'lt', 'ge', 'le', 'eq', 'ne']
for arith in operations:
if test_flex:
op = lambda x, y: getattr(df, arith)(y)
op.__name__ = arith
else:
op = getattr(operator, arith)
expr.set_use_numexpr(False)
expected = op(df, other)
expr.set_use_numexpr(True)
expr.get_test_result()
result = op(df, other)
used_numexpr = expr.get_test_result()
try:
if arith in numexpr_ops:
assert used_numexpr, "Did not use numexpr as expected."
else:
assert not used_numexpr, "Used numexpr unexpectedly."
assert_func(expected, result)
except Exception:
pprint_thing("Failed test with operation %r" % arith)
pprint_thing("test_flex was %r" % test_flex)
raise
def run_frame(self, df, other, binary_comp=None, run_binary=True,
**kwargs):
self.run_arithmetic(df, other, assert_frame_equal,
test_flex=False, **kwargs)
self.run_arithmetic(df, other, assert_frame_equal, test_flex=True,
**kwargs)
if run_binary:
if binary_comp is None:
expr.set_use_numexpr(False)
binary_comp = other + 1
expr.set_use_numexpr(True)
self.run_binary(df, binary_comp, assert_frame_equal,
test_flex=False, **kwargs)
self.run_binary(df, binary_comp, assert_frame_equal,
test_flex=True, **kwargs)
def run_series(self, ser, other, binary_comp=None, **kwargs):
self.run_arithmetic(ser, other, assert_series_equal,
test_flex=False, **kwargs)
self.run_arithmetic(ser, other, assert_almost_equal,
test_flex=True, **kwargs)
# series doesn't uses vec_compare instead of numexpr...
# if binary_comp is None:
# binary_comp = other + 1
# self.run_binary(ser, binary_comp, assert_frame_equal,
# test_flex=False, **kwargs)
# self.run_binary(ser, binary_comp, assert_frame_equal,
# test_flex=True, **kwargs)
def run_panel(self, panel, other, binary_comp=None, run_binary=True,
assert_func=assert_panel_equal, **kwargs):
self.run_arithmetic(panel, other, assert_func, test_flex=False,
**kwargs)
self.run_arithmetic(panel, other, assert_func, test_flex=True,
**kwargs)
if run_binary:
if binary_comp is None:
binary_comp = other + 1
self.run_binary(panel, binary_comp, assert_func,
test_flex=False, **kwargs)
self.run_binary(panel, binary_comp, assert_func,
test_flex=True, **kwargs)
def test_integer_arithmetic_frame(self):
self.run_frame(self.integer, self.integer)
def test_integer_arithmetic_series(self):
self.run_series(self.integer.iloc[:, 0], self.integer.iloc[:, 0])
@pytest.mark.slow
def test_integer_panel(self):
self.run_panel(_integer2_panel, np.random.randint(1, 100))
def test_float_arithemtic_frame(self):
self.run_frame(self.frame2, self.frame2)
def test_float_arithmetic_series(self):
self.run_series(self.frame2.iloc[:, 0], self.frame2.iloc[:, 0])
@pytest.mark.slow
def test_float_panel(self):
self.run_panel(_frame2_panel, np.random.randn() + 0.1, binary_comp=0.8)
def test_mixed_arithmetic_frame(self):
# TODO: FIGURE OUT HOW TO GET IT TO WORK...
# can't do arithmetic because comparison methods try to do *entire*
# frame instead of by-column
self.run_frame(self.mixed2, self.mixed2, run_binary=False)
def test_mixed_arithmetic_series(self):
for col in self.mixed2.columns:
self.run_series(self.mixed2[col], self.mixed2[col], binary_comp=4)
@pytest.mark.slow
def test_mixed_panel(self):
self.run_panel(_mixed2_panel, np.random.randint(1, 100),
binary_comp=-2)
def test_float_arithemtic(self):
self.run_arithmetic(self.frame, self.frame, assert_frame_equal)
self.run_arithmetic(self.frame.iloc[:, 0], self.frame.iloc[:, 0],
assert_series_equal, check_dtype=True)
def test_mixed_arithmetic(self):
self.run_arithmetic(self.mixed, self.mixed, assert_frame_equal)
for col in self.mixed.columns:
self.run_arithmetic(self.mixed[col], self.mixed[col],
assert_series_equal)
def test_integer_with_zeros(self):
self.integer *= np.random.randint(0, 2, size=np.shape(self.integer))
self.run_arithmetic(self.integer, self.integer,
assert_frame_equal)
self.run_arithmetic(self.integer.iloc[:, 0],
self.integer.iloc[:, 0], assert_series_equal)
def test_invalid(self):
# no op
result = expr._can_use_numexpr(operator.add, None, self.frame,
self.frame, 'evaluate')
assert not result
# mixed
result = expr._can_use_numexpr(operator.add, '+', self.mixed,
self.frame, 'evaluate')
assert not result
# min elements
result = expr._can_use_numexpr(operator.add, '+', self.frame2,
self.frame2, 'evaluate')
assert not result
# ok, we only check on first part of expression
result = expr._can_use_numexpr(operator.add, '+', self.frame,
self.frame2, 'evaluate')
assert result
def test_binary_ops(self):
def testit():
for f, f2 in [(self.frame, self.frame2),
(self.mixed, self.mixed2)]:
for op, op_str in [('add', '+'), ('sub', '-'), ('mul', '*'),
('div', '/'), ('pow', '**')]:
# numpy >= 1.11 doesn't handle integers
# raised to integer powers
# https://github.com/pandas-dev/pandas/issues/15363
if op == 'pow' and not _np_version_under1p11:
continue
if op == 'div':
op = getattr(operator, 'truediv', None)
else:
op = getattr(operator, op, None)
if op is not None:
result = expr._can_use_numexpr(op, op_str, f, f,
'evaluate')
assert result != f._is_mixed_type
result = expr.evaluate(op, op_str, f, f,
use_numexpr=True)
expected = expr.evaluate(op, op_str, f, f,
use_numexpr=False)
if isinstance(result, DataFrame):
tm.assert_frame_equal(result, expected)
else:
tm.assert_numpy_array_equal(result,
expected.values)
result = expr._can_use_numexpr(op, op_str, f2, f2,
'evaluate')
assert not result
expr.set_use_numexpr(False)
testit()
expr.set_use_numexpr(True)
expr.set_numexpr_threads(1)
testit()
expr.set_numexpr_threads()
testit()
def test_boolean_ops(self):
def testit():
for f, f2 in [(self.frame, self.frame2),
(self.mixed, self.mixed2)]:
f11 = f
f12 = f + 1
f21 = f2
f22 = f2 + 1
for op, op_str in [('gt', '>'), ('lt', '<'), ('ge', '>='),
('le', '<='), ('eq', '=='), ('ne', '!=')]:
op = getattr(operator, op)
result = expr._can_use_numexpr(op, op_str, f11, f12,
'evaluate')
assert result != f11._is_mixed_type
result = expr.evaluate(op, op_str, f11, f12,
use_numexpr=True)
expected = expr.evaluate(op, op_str, f11, f12,
use_numexpr=False)
if isinstance(result, DataFrame):
tm.assert_frame_equal(result, expected)
else:
tm.assert_numpy_array_equal(result, expected.values)
result = expr._can_use_numexpr(op, op_str, f21, f22,
'evaluate')
assert not result
expr.set_use_numexpr(False)
testit()
expr.set_use_numexpr(True)
expr.set_numexpr_threads(1)
testit()
expr.set_numexpr_threads()
testit()
def test_where(self):
def testit():
for f in [self.frame, self.frame2, self.mixed, self.mixed2]:
for cond in [True, False]:
c = np.empty(f.shape, dtype=np.bool_)
c.fill(cond)
result = expr.where(c, f.values, f.values + 1)
expected = np.where(c, f.values, f.values + 1)
tm.assert_numpy_array_equal(result, expected)
expr.set_use_numexpr(False)
testit()
expr.set_use_numexpr(True)
expr.set_numexpr_threads(1)
testit()
expr.set_numexpr_threads()
testit()
def test_bool_ops_raise_on_arithmetic(self):
df = DataFrame({'a': np.random.rand(10) > 0.5,
'b': np.random.rand(10) > 0.5})
names = 'div', 'truediv', 'floordiv', 'pow'
ops = '/', '/', '//', '**'
msg = 'operator %r not implemented for bool dtypes'
for op, name in zip(ops, names):
if not compat.PY3 or name != 'div':
f = getattr(operator, name)
err_msg = re.escape(msg % op)
with tm.assert_raises_regex(NotImplementedError, err_msg):
f(df, df)
with tm.assert_raises_regex(NotImplementedError, err_msg):
f(df.a, df.b)
with tm.assert_raises_regex(NotImplementedError, err_msg):
f(df.a, True)
with tm.assert_raises_regex(NotImplementedError, err_msg):
f(False, df.a)
with tm.assert_raises_regex(TypeError, err_msg):
f(False, df)
with tm.assert_raises_regex(TypeError, err_msg):
f(df, True)
def test_bool_ops_warn_on_arithmetic(self):
n = 10
df = DataFrame({'a': np.random.rand(n) > 0.5,
'b': np.random.rand(n) > 0.5})
names = 'add', 'mul', 'sub'
ops = '+', '*', '-'
subs = {'+': '|', '*': '&', '-': '^'}
sub_funcs = {'|': 'or_', '&': 'and_', '^': 'xor'}
for op, name in zip(ops, names):
f = getattr(operator, name)
fe = getattr(operator, sub_funcs[subs[op]])
# >= 1.13.0 these are now TypeErrors
if op == '-' and not _np_version_under1p13:
continue
with tm.use_numexpr(True, min_elements=5):
with tm.assert_produces_warning(check_stacklevel=False):
r = f(df, df)
e = fe(df, df)
tm.assert_frame_equal(r, e)
with tm.assert_produces_warning(check_stacklevel=False):
r = f(df.a, df.b)
e = fe(df.a, df.b)
tm.assert_series_equal(r, e)
with tm.assert_produces_warning(check_stacklevel=False):
r = f(df.a, True)
e = fe(df.a, True)
tm.assert_series_equal(r, e)
with tm.assert_produces_warning(check_stacklevel=False):
r = f(False, df.a)
e = fe(False, df.a)
tm.assert_series_equal(r, e)
with tm.assert_produces_warning(check_stacklevel=False):
r = f(False, df)
e = fe(False, df)
tm.assert_frame_equal(r, e)
with tm.assert_produces_warning(check_stacklevel=False):
r = f(df, True)
e = fe(df, True)
tm.assert_frame_equal(r, e)