laywerrobot/lib/python3.6/site-packages/pandas/tests/internals/test_internals.py

1296 lines
48 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
# -*- coding: utf-8 -*-
# pylint: disable=W0102
from datetime import datetime, date
import operator
import sys
import pytest
import numpy as np
import re
from distutils.version import LooseVersion
import itertools
from pandas import (Index, MultiIndex, DataFrame, DatetimeIndex,
Series, Categorical, TimedeltaIndex, SparseArray)
from pandas.compat import OrderedDict, lrange
from pandas.core.internals import (BlockPlacement, SingleBlockManager,
make_block, BlockManager)
import pandas.core.algorithms as algos
import pandas.util.testing as tm
import pandas as pd
from pandas.util.testing import (assert_almost_equal, assert_frame_equal,
randn, assert_series_equal)
from pandas.compat import zip, u
# in 3.6.1 a c-api slicing function changed, see src/compat_helper.h
PY361 = LooseVersion(sys.version) >= LooseVersion('3.6.1')
@pytest.fixture
def mgr():
return create_mgr(
'a: f8; b: object; c: f8; d: object; e: f8;'
'f: bool; g: i8; h: complex; i: datetime-1; j: datetime-2;'
'k: M8[ns, US/Eastern]; l: M8[ns, CET];')
def assert_block_equal(left, right):
tm.assert_numpy_array_equal(left.values, right.values)
assert left.dtype == right.dtype
assert isinstance(left.mgr_locs, BlockPlacement)
assert isinstance(right.mgr_locs, BlockPlacement)
tm.assert_numpy_array_equal(left.mgr_locs.as_array,
right.mgr_locs.as_array)
def get_numeric_mat(shape):
arr = np.arange(shape[0])
return np.lib.stride_tricks.as_strided(x=arr, shape=shape, strides=(
arr.itemsize, ) + (0, ) * (len(shape) - 1)).copy()
N = 10
def create_block(typestr, placement, item_shape=None, num_offset=0):
"""
Supported typestr:
* float, f8, f4, f2
* int, i8, i4, i2, i1
* uint, u8, u4, u2, u1
* complex, c16, c8
* bool
* object, string, O
* datetime, dt, M8[ns], M8[ns, tz]
* timedelta, td, m8[ns]
* sparse (SparseArray with fill_value=0.0)
* sparse_na (SparseArray with fill_value=np.nan)
* category, category2
"""
placement = BlockPlacement(placement)
num_items = len(placement)
if item_shape is None:
item_shape = (N, )
shape = (num_items, ) + item_shape
mat = get_numeric_mat(shape)
if typestr in ('float', 'f8', 'f4', 'f2', 'int', 'i8', 'i4', 'i2', 'i1',
'uint', 'u8', 'u4', 'u2', 'u1'):
values = mat.astype(typestr) + num_offset
elif typestr in ('complex', 'c16', 'c8'):
values = 1.j * (mat.astype(typestr) + num_offset)
elif typestr in ('object', 'string', 'O'):
values = np.reshape(['A%d' % i for i in mat.ravel() + num_offset],
shape)
elif typestr in ('b', 'bool', ):
values = np.ones(shape, dtype=np.bool_)
elif typestr in ('datetime', 'dt', 'M8[ns]'):
values = (mat * 1e9).astype('M8[ns]')
elif typestr.startswith('M8[ns'):
# datetime with tz
m = re.search(r'M8\[ns,\s*(\w+\/?\w*)\]', typestr)
assert m is not None, "incompatible typestr -> {0}".format(typestr)
tz = m.groups()[0]
assert num_items == 1, "must have only 1 num items for a tz-aware"
values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)
elif typestr in ('timedelta', 'td', 'm8[ns]'):
values = (mat * 1).astype('m8[ns]')
elif typestr in ('category', ):
values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
elif typestr in ('category2', ):
values = Categorical(['a', 'a', 'a', 'a', 'b', 'b', 'c', 'c', 'c', 'd'
])
elif typestr in ('sparse', 'sparse_na'):
# FIXME: doesn't support num_rows != 10
assert shape[-1] == 10
assert all(s == 1 for s in shape[:-1])
if typestr.endswith('_na'):
fill_value = np.nan
else:
fill_value = 0.0
values = SparseArray([fill_value, fill_value, 1, 2, 3, fill_value,
4, 5, fill_value, 6], fill_value=fill_value)
arr = values.sp_values.view()
arr += (num_offset - 1)
else:
raise ValueError('Unsupported typestr: "%s"' % typestr)
return make_block(values, placement=placement, ndim=len(shape))
def create_single_mgr(typestr, num_rows=None):
if num_rows is None:
num_rows = N
return SingleBlockManager(
create_block(typestr, placement=slice(0, num_rows), item_shape=()),
np.arange(num_rows))
def create_mgr(descr, item_shape=None):
"""
Construct BlockManager from string description.
String description syntax looks similar to np.matrix initializer. It looks
like this::
a,b,c: f8; d,e,f: i8
Rules are rather simple:
* see list of supported datatypes in `create_block` method
* components are semicolon-separated
* each component is `NAME,NAME,NAME: DTYPE_ID`
* whitespace around colons & semicolons are removed
* components with same DTYPE_ID are combined into single block
* to force multiple blocks with same dtype, use '-SUFFIX'::
'a:f8-1; b:f8-2; c:f8-foobar'
"""
if item_shape is None:
item_shape = (N, )
offset = 0
mgr_items = []
block_placements = OrderedDict()
for d in descr.split(';'):
d = d.strip()
if not len(d):
continue
names, blockstr = d.partition(':')[::2]
blockstr = blockstr.strip()
names = names.strip().split(',')
mgr_items.extend(names)
placement = list(np.arange(len(names)) + offset)
try:
block_placements[blockstr].extend(placement)
except KeyError:
block_placements[blockstr] = placement
offset += len(names)
mgr_items = Index(mgr_items)
blocks = []
num_offset = 0
for blockstr, placement in block_placements.items():
typestr = blockstr.split('-')[0]
blocks.append(create_block(typestr,
placement,
item_shape=item_shape,
num_offset=num_offset, ))
num_offset += len(placement)
return BlockManager(sorted(blocks, key=lambda b: b.mgr_locs[0]),
[mgr_items] + [np.arange(n) for n in item_shape])
class TestBlock(object):
def setup_method(self, method):
# self.fblock = get_float_ex() # a,c,e
# self.cblock = get_complex_ex() #
# self.oblock = get_obj_ex()
# self.bool_block = get_bool_ex()
# self.int_block = get_int_ex()
self.fblock = create_block('float', [0, 2, 4])
self.cblock = create_block('complex', [7])
self.oblock = create_block('object', [1, 3])
self.bool_block = create_block('bool', [5])
self.int_block = create_block('int', [6])
def test_constructor(self):
int32block = create_block('i4', [0])
assert int32block.dtype == np.int32
def test_pickle(self):
def _check(blk):
assert_block_equal(tm.round_trip_pickle(blk), blk)
_check(self.fblock)
_check(self.cblock)
_check(self.oblock)
_check(self.bool_block)
def test_mgr_locs(self):
assert isinstance(self.fblock.mgr_locs, BlockPlacement)
tm.assert_numpy_array_equal(self.fblock.mgr_locs.as_array,
np.array([0, 2, 4], dtype=np.int64))
def test_attrs(self):
assert self.fblock.shape == self.fblock.values.shape
assert self.fblock.dtype == self.fblock.values.dtype
assert len(self.fblock) == len(self.fblock.values)
def test_merge(self):
avals = randn(2, 10)
bvals = randn(2, 10)
ref_cols = Index(['e', 'a', 'b', 'd', 'f'])
ablock = make_block(avals, ref_cols.get_indexer(['e', 'b']))
bblock = make_block(bvals, ref_cols.get_indexer(['a', 'd']))
merged = ablock.merge(bblock)
tm.assert_numpy_array_equal(merged.mgr_locs.as_array,
np.array([0, 1, 2, 3], dtype=np.int64))
tm.assert_numpy_array_equal(merged.values[[0, 2]], np.array(avals))
tm.assert_numpy_array_equal(merged.values[[1, 3]], np.array(bvals))
# TODO: merge with mixed type?
def test_copy(self):
cop = self.fblock.copy()
assert cop is not self.fblock
assert_block_equal(self.fblock, cop)
def test_reindex_index(self):
pass
def test_reindex_cast(self):
pass
def test_insert(self):
pass
def test_delete(self):
newb = self.fblock.copy()
newb.delete(0)
assert isinstance(newb.mgr_locs, BlockPlacement)
tm.assert_numpy_array_equal(newb.mgr_locs.as_array,
np.array([2, 4], dtype=np.int64))
assert (newb.values[0] == 1).all()
newb = self.fblock.copy()
newb.delete(1)
assert isinstance(newb.mgr_locs, BlockPlacement)
tm.assert_numpy_array_equal(newb.mgr_locs.as_array,
np.array([0, 4], dtype=np.int64))
assert (newb.values[1] == 2).all()
newb = self.fblock.copy()
newb.delete(2)
tm.assert_numpy_array_equal(newb.mgr_locs.as_array,
np.array([0, 2], dtype=np.int64))
assert (newb.values[1] == 1).all()
newb = self.fblock.copy()
with pytest.raises(Exception):
newb.delete(3)
def test_make_block_same_class(self):
# issue 19431
block = create_block('M8[ns, US/Eastern]', [3])
with tm.assert_produces_warning(DeprecationWarning,
check_stacklevel=False):
block.make_block_same_class(block.values.values,
dtype=block.values.dtype)
class TestDatetimeBlock(object):
def test_try_coerce_arg(self):
block = create_block('datetime', [0])
# coerce None
none_coerced = block._try_coerce_args(block.values, None)[2]
assert pd.Timestamp(none_coerced) is pd.NaT
# coerce different types of date bojects
vals = (np.datetime64('2010-10-10'), datetime(2010, 10, 10),
date(2010, 10, 10))
for val in vals:
coerced = block._try_coerce_args(block.values, val)[2]
assert np.int64 == type(coerced)
assert pd.Timestamp('2010-10-10') == pd.Timestamp(coerced)
class TestBlockManager(object):
def test_constructor_corner(self):
pass
def test_attrs(self):
mgr = create_mgr('a,b,c: f8-1; d,e,f: f8-2')
assert mgr.nblocks == 2
assert len(mgr) == 6
def test_is_mixed_dtype(self):
assert not create_mgr('a,b:f8').is_mixed_type
assert not create_mgr('a:f8-1; b:f8-2').is_mixed_type
assert create_mgr('a,b:f8; c,d: f4').is_mixed_type
assert create_mgr('a,b:f8; c,d: object').is_mixed_type
def test_is_indexed_like(self):
mgr1 = create_mgr('a,b: f8')
mgr2 = create_mgr('a:i8; b:bool')
mgr3 = create_mgr('a,b,c: f8')
assert mgr1._is_indexed_like(mgr1)
assert mgr1._is_indexed_like(mgr2)
assert mgr1._is_indexed_like(mgr3)
assert not mgr1._is_indexed_like(mgr1.get_slice(
slice(-1), axis=1))
def test_duplicate_ref_loc_failure(self):
tmp_mgr = create_mgr('a:bool; a: f8')
axes, blocks = tmp_mgr.axes, tmp_mgr.blocks
blocks[0].mgr_locs = np.array([0])
blocks[1].mgr_locs = np.array([0])
# test trying to create block manager with overlapping ref locs
with pytest.raises(AssertionError):
BlockManager(blocks, axes)
blocks[0].mgr_locs = np.array([0])
blocks[1].mgr_locs = np.array([1])
mgr = BlockManager(blocks, axes)
mgr.iget(1)
def test_contains(self, mgr):
assert 'a' in mgr
assert 'baz' not in mgr
def test_pickle(self, mgr):
mgr2 = tm.round_trip_pickle(mgr)
assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
# share ref_items
# assert mgr2.blocks[0].ref_items is mgr2.blocks[1].ref_items
# GH2431
assert hasattr(mgr2, "_is_consolidated")
assert hasattr(mgr2, "_known_consolidated")
# reset to False on load
assert not mgr2._is_consolidated
assert not mgr2._known_consolidated
def test_non_unique_pickle(self):
mgr = create_mgr('a,a,a:f8')
mgr2 = tm.round_trip_pickle(mgr)
assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
mgr = create_mgr('a: f8; a: i8')
mgr2 = tm.round_trip_pickle(mgr)
assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
def test_categorical_block_pickle(self):
mgr = create_mgr('a: category')
mgr2 = tm.round_trip_pickle(mgr)
assert_frame_equal(DataFrame(mgr), DataFrame(mgr2))
smgr = create_single_mgr('category')
smgr2 = tm.round_trip_pickle(smgr)
assert_series_equal(Series(smgr), Series(smgr2))
def test_get_scalar(self, mgr):
for item in mgr.items:
for i, index in enumerate(mgr.axes[1]):
res = mgr.get_scalar((item, index))
exp = mgr.get(item, fastpath=False)[i]
assert res == exp
exp = mgr.get(item).internal_values()[i]
assert res == exp
def test_get(self):
cols = Index(list('abc'))
values = np.random.rand(3, 3)
block = make_block(values=values.copy(), placement=np.arange(3))
mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)])
assert_almost_equal(mgr.get('a', fastpath=False), values[0])
assert_almost_equal(mgr.get('b', fastpath=False), values[1])
assert_almost_equal(mgr.get('c', fastpath=False), values[2])
assert_almost_equal(mgr.get('a').internal_values(), values[0])
assert_almost_equal(mgr.get('b').internal_values(), values[1])
assert_almost_equal(mgr.get('c').internal_values(), values[2])
def test_set(self):
mgr = create_mgr('a,b,c: int', item_shape=(3, ))
mgr.set('d', np.array(['foo'] * 3))
mgr.set('b', np.array(['bar'] * 3))
tm.assert_numpy_array_equal(mgr.get('a').internal_values(),
np.array([0] * 3))
tm.assert_numpy_array_equal(mgr.get('b').internal_values(),
np.array(['bar'] * 3, dtype=np.object_))
tm.assert_numpy_array_equal(mgr.get('c').internal_values(),
np.array([2] * 3))
tm.assert_numpy_array_equal(mgr.get('d').internal_values(),
np.array(['foo'] * 3, dtype=np.object_))
def test_set_change_dtype(self, mgr):
mgr.set('baz', np.zeros(N, dtype=bool))
mgr.set('baz', np.repeat('foo', N))
assert mgr.get('baz').dtype == np.object_
mgr2 = mgr.consolidate()
mgr2.set('baz', np.repeat('foo', N))
assert mgr2.get('baz').dtype == np.object_
mgr2.set('quux', randn(N).astype(int))
assert mgr2.get('quux').dtype == np.int_
mgr2.set('quux', randn(N))
assert mgr2.get('quux').dtype == np.float_
def test_set_change_dtype_slice(self): # GH8850
cols = MultiIndex.from_tuples([('1st', 'a'), ('2nd', 'b'), ('3rd', 'c')
])
df = DataFrame([[1.0, 2, 3], [4.0, 5, 6]], columns=cols)
df['2nd'] = df['2nd'] * 2.0
blocks = df._to_dict_of_blocks()
assert sorted(blocks.keys()) == ['float64', 'int64']
assert_frame_equal(blocks['float64'], DataFrame(
[[1.0, 4.0], [4.0, 10.0]], columns=cols[:2]))
assert_frame_equal(blocks['int64'], DataFrame(
[[3], [6]], columns=cols[2:]))
def test_copy(self, mgr):
cp = mgr.copy(deep=False)
for blk, cp_blk in zip(mgr.blocks, cp.blocks):
# view assertion
assert cp_blk.equals(blk)
if isinstance(blk.values, np.ndarray):
assert cp_blk.values.base is blk.values.base
else:
# DatetimeTZBlock has DatetimeIndex values
assert cp_blk.values.values.base is blk.values.values.base
cp = mgr.copy(deep=True)
for blk, cp_blk in zip(mgr.blocks, cp.blocks):
# copy assertion we either have a None for a base or in case of
# some blocks it is an array (e.g. datetimetz), but was copied
assert cp_blk.equals(blk)
if not isinstance(cp_blk.values, np.ndarray):
assert cp_blk.values.values.base is not blk.values.values.base
else:
assert cp_blk.values.base is None and blk.values.base is None
def test_sparse(self):
mgr = create_mgr('a: sparse-1; b: sparse-2')
# what to test here?
assert mgr.as_array().dtype == np.float64
def test_sparse_mixed(self):
mgr = create_mgr('a: sparse-1; b: sparse-2; c: f8')
assert len(mgr.blocks) == 3
assert isinstance(mgr, BlockManager)
# what to test here?
def test_as_array_float(self):
mgr = create_mgr('c: f4; d: f2; e: f8')
assert mgr.as_array().dtype == np.float64
mgr = create_mgr('c: f4; d: f2')
assert mgr.as_array().dtype == np.float32
def test_as_array_int_bool(self):
mgr = create_mgr('a: bool-1; b: bool-2')
assert mgr.as_array().dtype == np.bool_
mgr = create_mgr('a: i8-1; b: i8-2; c: i4; d: i2; e: u1')
assert mgr.as_array().dtype == np.int64
mgr = create_mgr('c: i4; d: i2; e: u1')
assert mgr.as_array().dtype == np.int32
def test_as_array_datetime(self):
mgr = create_mgr('h: datetime-1; g: datetime-2')
assert mgr.as_array().dtype == 'M8[ns]'
def test_as_array_datetime_tz(self):
mgr = create_mgr('h: M8[ns, US/Eastern]; g: M8[ns, CET]')
assert mgr.get('h').dtype == 'datetime64[ns, US/Eastern]'
assert mgr.get('g').dtype == 'datetime64[ns, CET]'
assert mgr.as_array().dtype == 'object'
def test_astype(self):
# coerce all
mgr = create_mgr('c: f4; d: f2; e: f8')
for t in ['float16', 'float32', 'float64', 'int32', 'int64']:
t = np.dtype(t)
tmgr = mgr.astype(t)
assert tmgr.get('c').dtype.type == t
assert tmgr.get('d').dtype.type == t
assert tmgr.get('e').dtype.type == t
# mixed
mgr = create_mgr('a,b: object; c: bool; d: datetime;'
'e: f4; f: f2; g: f8')
for t in ['float16', 'float32', 'float64', 'int32', 'int64']:
t = np.dtype(t)
tmgr = mgr.astype(t, errors='ignore')
assert tmgr.get('c').dtype.type == t
assert tmgr.get('e').dtype.type == t
assert tmgr.get('f').dtype.type == t
assert tmgr.get('g').dtype.type == t
assert tmgr.get('a').dtype.type == np.object_
assert tmgr.get('b').dtype.type == np.object_
if t != np.int64:
assert tmgr.get('d').dtype.type == np.datetime64
else:
assert tmgr.get('d').dtype.type == t
def test_convert(self):
def _compare(old_mgr, new_mgr):
""" compare the blocks, numeric compare ==, object don't """
old_blocks = set(old_mgr.blocks)
new_blocks = set(new_mgr.blocks)
assert len(old_blocks) == len(new_blocks)
# compare non-numeric
for b in old_blocks:
found = False
for nb in new_blocks:
if (b.values == nb.values).all():
found = True
break
assert found
for b in new_blocks:
found = False
for ob in old_blocks:
if (b.values == ob.values).all():
found = True
break
assert found
# noops
mgr = create_mgr('f: i8; g: f8')
new_mgr = mgr.convert()
_compare(mgr, new_mgr)
mgr = create_mgr('a, b: object; f: i8; g: f8')
new_mgr = mgr.convert()
_compare(mgr, new_mgr)
# convert
mgr = create_mgr('a,b,foo: object; f: i8; g: f8')
mgr.set('a', np.array(['1'] * N, dtype=np.object_))
mgr.set('b', np.array(['2.'] * N, dtype=np.object_))
mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_))
new_mgr = mgr.convert(numeric=True)
assert new_mgr.get('a').dtype == np.int64
assert new_mgr.get('b').dtype == np.float64
assert new_mgr.get('foo').dtype == np.object_
assert new_mgr.get('f').dtype == np.int64
assert new_mgr.get('g').dtype == np.float64
mgr = create_mgr('a,b,foo: object; f: i4; bool: bool; dt: datetime;'
'i: i8; g: f8; h: f2')
mgr.set('a', np.array(['1'] * N, dtype=np.object_))
mgr.set('b', np.array(['2.'] * N, dtype=np.object_))
mgr.set('foo', np.array(['foo.'] * N, dtype=np.object_))
new_mgr = mgr.convert(numeric=True)
assert new_mgr.get('a').dtype == np.int64
assert new_mgr.get('b').dtype == np.float64
assert new_mgr.get('foo').dtype == np.object_
assert new_mgr.get('f').dtype == np.int32
assert new_mgr.get('bool').dtype == np.bool_
assert new_mgr.get('dt').dtype.type, np.datetime64
assert new_mgr.get('i').dtype == np.int64
assert new_mgr.get('g').dtype == np.float64
assert new_mgr.get('h').dtype == np.float16
def test_interleave(self):
# self
for dtype in ['f8', 'i8', 'object', 'bool', 'complex', 'M8[ns]',
'm8[ns]']:
mgr = create_mgr('a: {0}'.format(dtype))
assert mgr.as_array().dtype == dtype
mgr = create_mgr('a: {0}; b: {0}'.format(dtype))
assert mgr.as_array().dtype == dtype
# will be converted according the actual dtype of the underlying
mgr = create_mgr('a: category')
assert mgr.as_array().dtype == 'i8'
mgr = create_mgr('a: category; b: category')
assert mgr.as_array().dtype == 'i8'
mgr = create_mgr('a: category; b: category2')
assert mgr.as_array().dtype == 'object'
mgr = create_mgr('a: category2')
assert mgr.as_array().dtype == 'object'
mgr = create_mgr('a: category2; b: category2')
assert mgr.as_array().dtype == 'object'
# combinations
mgr = create_mgr('a: f8')
assert mgr.as_array().dtype == 'f8'
mgr = create_mgr('a: f8; b: i8')
assert mgr.as_array().dtype == 'f8'
mgr = create_mgr('a: f4; b: i8')
assert mgr.as_array().dtype == 'f8'
mgr = create_mgr('a: f4; b: i8; d: object')
assert mgr.as_array().dtype == 'object'
mgr = create_mgr('a: bool; b: i8')
assert mgr.as_array().dtype == 'object'
mgr = create_mgr('a: complex')
assert mgr.as_array().dtype == 'complex'
mgr = create_mgr('a: f8; b: category')
assert mgr.as_array().dtype == 'object'
mgr = create_mgr('a: M8[ns]; b: category')
assert mgr.as_array().dtype == 'object'
mgr = create_mgr('a: M8[ns]; b: bool')
assert mgr.as_array().dtype == 'object'
mgr = create_mgr('a: M8[ns]; b: i8')
assert mgr.as_array().dtype == 'object'
mgr = create_mgr('a: m8[ns]; b: bool')
assert mgr.as_array().dtype == 'object'
mgr = create_mgr('a: m8[ns]; b: i8')
assert mgr.as_array().dtype == 'object'
mgr = create_mgr('a: M8[ns]; b: m8[ns]')
assert mgr.as_array().dtype == 'object'
def test_interleave_non_unique_cols(self):
df = DataFrame([
[pd.Timestamp('20130101'), 3.5],
[pd.Timestamp('20130102'), 4.5]],
columns=['x', 'x'],
index=[1, 2])
df_unique = df.copy()
df_unique.columns = ['x', 'y']
assert df_unique.values.shape == df.values.shape
tm.assert_numpy_array_equal(df_unique.values[0], df.values[0])
tm.assert_numpy_array_equal(df_unique.values[1], df.values[1])
def test_consolidate(self):
pass
def test_consolidate_ordering_issues(self, mgr):
mgr.set('f', randn(N))
mgr.set('d', randn(N))
mgr.set('b', randn(N))
mgr.set('g', randn(N))
mgr.set('h', randn(N))
# we have datetime/tz blocks in mgr
cons = mgr.consolidate()
assert cons.nblocks == 4
cons = mgr.consolidate().get_numeric_data()
assert cons.nblocks == 1
assert isinstance(cons.blocks[0].mgr_locs, BlockPlacement)
tm.assert_numpy_array_equal(cons.blocks[0].mgr_locs.as_array,
np.arange(len(cons.items), dtype=np.int64))
def test_reindex_index(self):
pass
def test_reindex_items(self):
# mgr is not consolidated, f8 & f8-2 blocks
mgr = create_mgr('a: f8; b: i8; c: f8; d: i8; e: f8;'
'f: bool; g: f8-2')
reindexed = mgr.reindex_axis(['g', 'c', 'a', 'd'], axis=0)
assert reindexed.nblocks == 2
tm.assert_index_equal(reindexed.items, pd.Index(['g', 'c', 'a', 'd']))
assert_almost_equal(
mgr.get('g', fastpath=False), reindexed.get('g', fastpath=False))
assert_almost_equal(
mgr.get('c', fastpath=False), reindexed.get('c', fastpath=False))
assert_almost_equal(
mgr.get('a', fastpath=False), reindexed.get('a', fastpath=False))
assert_almost_equal(
mgr.get('d', fastpath=False), reindexed.get('d', fastpath=False))
assert_almost_equal(
mgr.get('g').internal_values(),
reindexed.get('g').internal_values())
assert_almost_equal(
mgr.get('c').internal_values(),
reindexed.get('c').internal_values())
assert_almost_equal(
mgr.get('a').internal_values(),
reindexed.get('a').internal_values())
assert_almost_equal(
mgr.get('d').internal_values(),
reindexed.get('d').internal_values())
def test_multiindex_xs(self):
mgr = create_mgr('a,b,c: f8; d,e,f: i8')
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two',
'three']],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=['first', 'second'])
mgr.set_axis(1, index)
result = mgr.xs('bar', axis=1)
assert result.shape == (6, 2)
assert result.axes[1][0] == ('bar', 'one')
assert result.axes[1][1] == ('bar', 'two')
def test_get_numeric_data(self):
mgr = create_mgr('int: int; float: float; complex: complex;'
'str: object; bool: bool; obj: object; dt: datetime',
item_shape=(3, ))
mgr.set('obj', np.array([1, 2, 3], dtype=np.object_))
numeric = mgr.get_numeric_data()
tm.assert_index_equal(numeric.items,
pd.Index(['int', 'float', 'complex', 'bool']))
assert_almost_equal(
mgr.get('float', fastpath=False), numeric.get('float',
fastpath=False))
assert_almost_equal(
mgr.get('float').internal_values(),
numeric.get('float').internal_values())
# Check sharing
numeric.set('float', np.array([100., 200., 300.]))
assert_almost_equal(
mgr.get('float', fastpath=False), np.array([100., 200., 300.]))
assert_almost_equal(
mgr.get('float').internal_values(), np.array([100., 200., 300.]))
numeric2 = mgr.get_numeric_data(copy=True)
tm.assert_index_equal(numeric.items,
pd.Index(['int', 'float', 'complex', 'bool']))
numeric2.set('float', np.array([1000., 2000., 3000.]))
assert_almost_equal(
mgr.get('float', fastpath=False), np.array([100., 200., 300.]))
assert_almost_equal(
mgr.get('float').internal_values(), np.array([100., 200., 300.]))
def test_get_bool_data(self):
mgr = create_mgr('int: int; float: float; complex: complex;'
'str: object; bool: bool; obj: object; dt: datetime',
item_shape=(3, ))
mgr.set('obj', np.array([True, False, True], dtype=np.object_))
bools = mgr.get_bool_data()
tm.assert_index_equal(bools.items, pd.Index(['bool']))
assert_almost_equal(mgr.get('bool', fastpath=False),
bools.get('bool', fastpath=False))
assert_almost_equal(
mgr.get('bool').internal_values(),
bools.get('bool').internal_values())
bools.set('bool', np.array([True, False, True]))
tm.assert_numpy_array_equal(mgr.get('bool', fastpath=False),
np.array([True, False, True]))
tm.assert_numpy_array_equal(mgr.get('bool').internal_values(),
np.array([True, False, True]))
# Check sharing
bools2 = mgr.get_bool_data(copy=True)
bools2.set('bool', np.array([False, True, False]))
tm.assert_numpy_array_equal(mgr.get('bool', fastpath=False),
np.array([True, False, True]))
tm.assert_numpy_array_equal(mgr.get('bool').internal_values(),
np.array([True, False, True]))
def test_unicode_repr_doesnt_raise(self):
repr(create_mgr(u('b,\u05d0: object')))
def test_missing_unicode_key(self):
df = DataFrame({"a": [1]})
try:
df.loc[:, u("\u05d0")] # should not raise UnicodeEncodeError
except KeyError:
pass # this is the expected exception
def test_equals(self):
# unique items
bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2')
bm2 = BlockManager(bm1.blocks[::-1], bm1.axes)
assert bm1.equals(bm2)
bm1 = create_mgr('a,a,a: i8-1; b,b,b: i8-2')
bm2 = BlockManager(bm1.blocks[::-1], bm1.axes)
assert bm1.equals(bm2)
def test_equals_block_order_different_dtypes(self):
# GH 9330
mgr_strings = [
"a:i8;b:f8", # basic case
"a:i8;b:f8;c:c8;d:b", # many types
"a:i8;e:dt;f:td;g:string", # more types
"a:i8;b:category;c:category2;d:category2", # categories
"c:sparse;d:sparse_na;b:f8", # sparse
]
for mgr_string in mgr_strings:
bm = create_mgr(mgr_string)
block_perms = itertools.permutations(bm.blocks)
for bm_perm in block_perms:
bm_this = BlockManager(bm_perm, bm.axes)
assert bm.equals(bm_this)
assert bm_this.equals(bm)
def test_single_mgr_ctor(self):
mgr = create_single_mgr('f8', num_rows=5)
assert mgr.as_array().tolist() == [0., 1., 2., 3., 4.]
def test_validate_bool_args(self):
invalid_values = [1, "True", [1, 2, 3], 5.0]
bm1 = create_mgr('a,b,c: i8-1; d,e,f: i8-2')
for value in invalid_values:
with pytest.raises(ValueError):
bm1.replace_list([1], [2], inplace=value)
class TestIndexing(object):
# Nosetests-style data-driven tests.
#
# This test applies different indexing routines to block managers and
# compares the outcome to the result of same operations on np.ndarray.
#
# NOTE: sparse (SparseBlock with fill_value != np.nan) fail a lot of tests
# and are disabled.
MANAGERS = [
create_single_mgr('f8', N),
create_single_mgr('i8', N),
# create_single_mgr('sparse', N),
create_single_mgr('sparse_na', N),
# 2-dim
create_mgr('a,b,c,d,e,f: f8', item_shape=(N,)),
create_mgr('a,b,c,d,e,f: i8', item_shape=(N,)),
create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N,)),
create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N,)),
# create_mgr('a: sparse', item_shape=(N,)),
create_mgr('a: sparse_na', item_shape=(N,)),
# 3-dim
create_mgr('a,b,c,d,e,f: f8', item_shape=(N, N)),
create_mgr('a,b,c,d,e,f: i8', item_shape=(N, N)),
create_mgr('a,b: f8; c,d: i8; e,f: string', item_shape=(N, N)),
create_mgr('a,b: f8; c,d: i8; e,f: f8', item_shape=(N, N)),
# create_mgr('a: sparse', item_shape=(1, N)),
]
# MANAGERS = [MANAGERS[6]]
def test_get_slice(self):
def assert_slice_ok(mgr, axis, slobj):
# import pudb; pudb.set_trace()
mat = mgr.as_array()
# we maybe using an ndarray to test slicing and
# might not be the full length of the axis
if isinstance(slobj, np.ndarray):
ax = mgr.axes[axis]
if len(ax) and len(slobj) and len(slobj) != len(ax):
slobj = np.concatenate([slobj, np.zeros(
len(ax) - len(slobj), dtype=bool)])
sliced = mgr.get_slice(slobj, axis=axis)
mat_slobj = (slice(None), ) * axis + (slobj, )
tm.assert_numpy_array_equal(mat[mat_slobj], sliced.as_array(),
check_dtype=False)
tm.assert_index_equal(mgr.axes[axis][slobj], sliced.axes[axis])
for mgr in self.MANAGERS:
for ax in range(mgr.ndim):
# slice
assert_slice_ok(mgr, ax, slice(None))
assert_slice_ok(mgr, ax, slice(3))
assert_slice_ok(mgr, ax, slice(100))
assert_slice_ok(mgr, ax, slice(1, 4))
assert_slice_ok(mgr, ax, slice(3, 0, -2))
# boolean mask
assert_slice_ok(
mgr, ax, np.array([], dtype=np.bool_))
assert_slice_ok(
mgr, ax,
np.ones(mgr.shape[ax], dtype=np.bool_))
assert_slice_ok(
mgr, ax,
np.zeros(mgr.shape[ax], dtype=np.bool_))
if mgr.shape[ax] >= 3:
assert_slice_ok(
mgr, ax,
np.arange(mgr.shape[ax]) % 3 == 0)
assert_slice_ok(
mgr, ax, np.array(
[True, True, False], dtype=np.bool_))
# fancy indexer
assert_slice_ok(mgr, ax, [])
assert_slice_ok(mgr, ax, lrange(mgr.shape[ax]))
if mgr.shape[ax] >= 3:
assert_slice_ok(mgr, ax, [0, 1, 2])
assert_slice_ok(mgr, ax, [-1, -2, -3])
def test_take(self):
def assert_take_ok(mgr, axis, indexer):
mat = mgr.as_array()
taken = mgr.take(indexer, axis)
tm.assert_numpy_array_equal(np.take(mat, indexer, axis),
taken.as_array(), check_dtype=False)
tm.assert_index_equal(mgr.axes[axis].take(indexer),
taken.axes[axis])
for mgr in self.MANAGERS:
for ax in range(mgr.ndim):
# take/fancy indexer
assert_take_ok(mgr, ax, [])
assert_take_ok(mgr, ax, [0, 0, 0])
assert_take_ok(mgr, ax, lrange(mgr.shape[ax]))
if mgr.shape[ax] >= 3:
assert_take_ok(mgr, ax, [0, 1, 2])
assert_take_ok(mgr, ax, [-1, -2, -3])
def test_reindex_axis(self):
def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value):
mat = mgr.as_array()
indexer = mgr.axes[axis].get_indexer_for(new_labels)
reindexed = mgr.reindex_axis(new_labels, axis,
fill_value=fill_value)
tm.assert_numpy_array_equal(algos.take_nd(mat, indexer, axis,
fill_value=fill_value),
reindexed.as_array(),
check_dtype=False)
tm.assert_index_equal(reindexed.axes[axis], new_labels)
for mgr in self.MANAGERS:
for ax in range(mgr.ndim):
for fill_value in (None, np.nan, 100.):
assert_reindex_axis_is_ok(
mgr, ax,
pd.Index([]), fill_value)
assert_reindex_axis_is_ok(
mgr, ax, mgr.axes[ax],
fill_value)
assert_reindex_axis_is_ok(
mgr, ax,
mgr.axes[ax][[0, 0, 0]], fill_value)
assert_reindex_axis_is_ok(
mgr, ax,
pd.Index(['foo', 'bar', 'baz']), fill_value)
assert_reindex_axis_is_ok(
mgr, ax,
pd.Index(['foo', mgr.axes[ax][0], 'baz']),
fill_value)
if mgr.shape[ax] >= 3:
assert_reindex_axis_is_ok(
mgr, ax,
mgr.axes[ax][:-3], fill_value)
assert_reindex_axis_is_ok(
mgr, ax,
mgr.axes[ax][-3::-1], fill_value)
assert_reindex_axis_is_ok(
mgr, ax,
mgr.axes[ax][[0, 1, 2, 0, 1, 2]], fill_value)
def test_reindex_indexer(self):
def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer,
fill_value):
mat = mgr.as_array()
reindexed_mat = algos.take_nd(mat, indexer, axis,
fill_value=fill_value)
reindexed = mgr.reindex_indexer(new_labels, indexer, axis,
fill_value=fill_value)
tm.assert_numpy_array_equal(reindexed_mat,
reindexed.as_array(),
check_dtype=False)
tm.assert_index_equal(reindexed.axes[axis], new_labels)
for mgr in self.MANAGERS:
for ax in range(mgr.ndim):
for fill_value in (None, np.nan, 100.):
assert_reindex_indexer_is_ok(
mgr, ax,
pd.Index([]), [], fill_value)
assert_reindex_indexer_is_ok(
mgr, ax,
mgr.axes[ax], np.arange(mgr.shape[ax]), fill_value)
assert_reindex_indexer_is_ok(
mgr, ax,
pd.Index(['foo'] * mgr.shape[ax]),
np.arange(mgr.shape[ax]), fill_value)
assert_reindex_indexer_is_ok(
mgr, ax,
mgr.axes[ax][::-1], np.arange(mgr.shape[ax]),
fill_value)
assert_reindex_indexer_is_ok(
mgr, ax, mgr.axes[ax],
np.arange(mgr.shape[ax])[::-1], fill_value)
assert_reindex_indexer_is_ok(
mgr, ax,
pd.Index(['foo', 'bar', 'baz']),
[0, 0, 0], fill_value)
assert_reindex_indexer_is_ok(
mgr, ax,
pd.Index(['foo', 'bar', 'baz']),
[-1, 0, -1], fill_value)
assert_reindex_indexer_is_ok(
mgr, ax,
pd.Index(['foo', mgr.axes[ax][0], 'baz']),
[-1, -1, -1], fill_value)
if mgr.shape[ax] >= 3:
assert_reindex_indexer_is_ok(
mgr, ax,
pd.Index(['foo', 'bar', 'baz']),
[0, 1, 2], fill_value)
# test_get_slice(slice_like, axis)
# take(indexer, axis)
# reindex_axis(new_labels, axis)
# reindex_indexer(new_labels, indexer, axis)
class TestBlockPlacement(object):
def test_slice_len(self):
assert len(BlockPlacement(slice(0, 4))) == 4
assert len(BlockPlacement(slice(0, 4, 2))) == 2
assert len(BlockPlacement(slice(0, 3, 2))) == 2
assert len(BlockPlacement(slice(0, 1, 2))) == 1
assert len(BlockPlacement(slice(1, 0, -1))) == 1
def test_zero_step_raises(self):
with pytest.raises(ValueError):
BlockPlacement(slice(1, 1, 0))
with pytest.raises(ValueError):
BlockPlacement(slice(1, 2, 0))
def test_unbounded_slice_raises(self):
def assert_unbounded_slice_error(slc):
tm.assert_raises_regex(ValueError, "unbounded slice",
lambda: BlockPlacement(slc))
assert_unbounded_slice_error(slice(None, None))
assert_unbounded_slice_error(slice(10, None))
assert_unbounded_slice_error(slice(None, None, -1))
assert_unbounded_slice_error(slice(None, 10, -1))
# These are "unbounded" because negative index will change depending on
# container shape.
assert_unbounded_slice_error(slice(-1, None))
assert_unbounded_slice_error(slice(None, -1))
assert_unbounded_slice_error(slice(-1, -1))
assert_unbounded_slice_error(slice(-1, None, -1))
assert_unbounded_slice_error(slice(None, -1, -1))
assert_unbounded_slice_error(slice(-1, -1, -1))
def test_not_slice_like_slices(self):
def assert_not_slice_like(slc):
assert not BlockPlacement(slc).is_slice_like
assert_not_slice_like(slice(0, 0))
assert_not_slice_like(slice(100, 0))
assert_not_slice_like(slice(100, 100, -1))
assert_not_slice_like(slice(0, 100, -1))
assert not BlockPlacement(slice(0, 0)).is_slice_like
assert not BlockPlacement(slice(100, 100)).is_slice_like
def test_array_to_slice_conversion(self):
def assert_as_slice_equals(arr, slc):
assert BlockPlacement(arr).as_slice == slc
assert_as_slice_equals([0], slice(0, 1, 1))
assert_as_slice_equals([100], slice(100, 101, 1))
assert_as_slice_equals([0, 1, 2], slice(0, 3, 1))
assert_as_slice_equals([0, 5, 10], slice(0, 15, 5))
assert_as_slice_equals([0, 100], slice(0, 200, 100))
assert_as_slice_equals([2, 1], slice(2, 0, -1))
if not PY361:
assert_as_slice_equals([2, 1, 0], slice(2, None, -1))
assert_as_slice_equals([100, 0], slice(100, None, -100))
def test_not_slice_like_arrays(self):
def assert_not_slice_like(arr):
assert not BlockPlacement(arr).is_slice_like
assert_not_slice_like([])
assert_not_slice_like([-1])
assert_not_slice_like([-1, -2, -3])
assert_not_slice_like([-10])
assert_not_slice_like([-1])
assert_not_slice_like([-1, 0, 1, 2])
assert_not_slice_like([-2, 0, 2, 4])
assert_not_slice_like([1, 0, -1])
assert_not_slice_like([1, 1, 1])
def test_slice_iter(self):
assert list(BlockPlacement(slice(0, 3))) == [0, 1, 2]
assert list(BlockPlacement(slice(0, 0))) == []
assert list(BlockPlacement(slice(3, 0))) == []
if not PY361:
assert list(BlockPlacement(slice(3, 0, -1))) == [3, 2, 1]
assert list(BlockPlacement(slice(3, None, -1))) == [3, 2, 1, 0]
def test_slice_to_array_conversion(self):
def assert_as_array_equals(slc, asarray):
tm.assert_numpy_array_equal(
BlockPlacement(slc).as_array,
np.asarray(asarray, dtype=np.int64))
assert_as_array_equals(slice(0, 3), [0, 1, 2])
assert_as_array_equals(slice(0, 0), [])
assert_as_array_equals(slice(3, 0), [])
assert_as_array_equals(slice(3, 0, -1), [3, 2, 1])
if not PY361:
assert_as_array_equals(slice(3, None, -1), [3, 2, 1, 0])
assert_as_array_equals(slice(31, None, -10), [31, 21, 11, 1])
def test_blockplacement_add(self):
bpl = BlockPlacement(slice(0, 5))
assert bpl.add(1).as_slice == slice(1, 6, 1)
assert bpl.add(np.arange(5)).as_slice == slice(0, 10, 2)
assert list(bpl.add(np.arange(5, 0, -1))) == [5, 5, 5, 5, 5]
def test_blockplacement_add_int(self):
def assert_add_equals(val, inc, result):
assert list(BlockPlacement(val).add(inc)) == result
assert_add_equals(slice(0, 0), 0, [])
assert_add_equals(slice(1, 4), 0, [1, 2, 3])
assert_add_equals(slice(3, 0, -1), 0, [3, 2, 1])
assert_add_equals([1, 2, 4], 0, [1, 2, 4])
assert_add_equals(slice(0, 0), 10, [])
assert_add_equals(slice(1, 4), 10, [11, 12, 13])
assert_add_equals(slice(3, 0, -1), 10, [13, 12, 11])
assert_add_equals([1, 2, 4], 10, [11, 12, 14])
assert_add_equals(slice(0, 0), -1, [])
assert_add_equals(slice(1, 4), -1, [0, 1, 2])
assert_add_equals([1, 2, 4], -1, [0, 1, 3])
with pytest.raises(ValueError):
BlockPlacement(slice(1, 4)).add(-10)
with pytest.raises(ValueError):
BlockPlacement([1, 2, 4]).add(-10)
if not PY361:
assert_add_equals(slice(3, 0, -1), -1, [2, 1, 0])
assert_add_equals(slice(2, None, -1), 0, [2, 1, 0])
assert_add_equals(slice(2, None, -1), 10, [12, 11, 10])
with pytest.raises(ValueError):
BlockPlacement(slice(2, None, -1)).add(-1)
class DummyElement(object):
def __init__(self, value, dtype):
self.value = value
self.dtype = np.dtype(dtype)
def __array__(self):
return np.array(self.value, dtype=self.dtype)
def __str__(self):
return "DummyElement({}, {})".format(self.value, self.dtype)
def __repr__(self):
return str(self)
def astype(self, dtype, copy=False):
self.dtype = dtype
return self
def view(self, dtype):
return type(self)(self.value.view(dtype), dtype)
def any(self, axis=None):
return bool(self.value)
class TestCanHoldElement(object):
@pytest.mark.parametrize('value, dtype', [
(1, 'i8'),
(1.0, 'f8'),
(2**63, 'f8'),
(1j, 'complex128'),
(2**63, 'complex128'),
(True, 'bool'),
(np.timedelta64(20, 'ns'), '<m8[ns]'),
(np.datetime64(20, 'ns'), '<M8[ns]'),
])
@pytest.mark.parametrize('op', [
operator.add,
operator.sub,
operator.mul,
operator.truediv,
operator.mod,
operator.pow,
], ids=lambda x: x.__name__)
def test_binop_other(self, op, value, dtype):
skip = {(operator.add, 'bool'),
(operator.sub, 'bool'),
(operator.mul, 'bool'),
(operator.truediv, 'bool'),
(operator.mod, 'i8'),
(operator.mod, 'complex128'),
(operator.mod, '<M8[ns]'),
(operator.mod, '<m8[ns]'),
(operator.pow, 'bool')}
if (op, dtype) in skip:
pytest.skip("Invalid combination {},{}".format(op, dtype))
e = DummyElement(value, dtype)
s = pd.DataFrame({"A": [e.value, e.value]}, dtype=e.dtype)
result = op(s, e).dtypes
expected = op(s, value).dtypes
assert_series_equal(result, expected)
@pytest.mark.parametrize('typestr, holder', [
('category', Categorical),
('M8[ns]', DatetimeIndex),
('M8[ns, US/Central]', DatetimeIndex),
('m8[ns]', TimedeltaIndex),
('sparse', SparseArray),
])
def test_holder(typestr, holder):
blk = create_block(typestr, [1])
assert blk._holder is holder
def test_deprecated_fastpath():
# GH#19265
values = np.random.rand(3, 3)
with tm.assert_produces_warning(DeprecationWarning,
check_stacklevel=False):
make_block(values, placement=np.arange(3), fastpath=True)
def test_validate_ndim():
values = np.array([1.0, 2.0])
placement = slice(2)
msg = "Wrong number of dimensions. values.ndim != ndim \[1 != 2\]"
with tm.assert_raises_regex(ValueError, msg):
make_block(values, placement, ndim=2)