import warnings
import copy
from warnings import catch_warnings
import inspect
import itertools
import re
import operator
from datetime import datetime, timedelta, date
from collections import defaultdict
from functools import partial

import numpy as np

from pandas._libs import internals as libinternals

from pandas.core.base import PandasObject

from pandas.core.dtypes.dtypes import (
    ExtensionDtype, DatetimeTZDtype,
    PandasExtensionDtype,
    CategoricalDtype)
from pandas.core.dtypes.common import (
    _TD_DTYPE, _NS_DTYPE,
    _ensure_int64, _ensure_platform_int,
    is_integer,
    is_dtype_equal,
    is_timedelta64_dtype,
    is_datetime64_dtype, is_datetimetz, is_sparse,
    is_categorical, is_categorical_dtype,
    is_integer_dtype,
    is_datetime64tz_dtype,
    is_bool_dtype,
    is_object_dtype,
    is_datetimelike_v_numeric,
    is_float_dtype, is_numeric_dtype,
    is_numeric_v_string_like, is_extension_type,
    is_extension_array_dtype,
    is_list_like,
    is_re,
    is_re_compilable,
    is_scalar,
    _get_dtype)
from pandas.core.dtypes.cast import (
    maybe_downcast_to_dtype,
    maybe_upcast,
    maybe_promote,
    infer_dtype_from,
    infer_dtype_from_scalar,
    soft_convert_objects,
    maybe_convert_objects,
    astype_nansafe,
    find_common_type,
    maybe_infer_dtype_type)
from pandas.core.dtypes.missing import (
    isna, notna, array_equivalent,
    _isna_compat,
    is_null_datelike_scalar)
import pandas.core.dtypes.concat as _concat

from pandas.core.dtypes.generic import (
    ABCSeries,
    ABCDatetimeIndex,
    ABCExtensionArray,
    ABCIndexClass)
import pandas.core.common as com
import pandas.core.algorithms as algos

from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import maybe_convert_indices, check_setitem_lengths
from pandas.core.arrays import Categorical
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.timedeltas import TimedeltaIndex
from pandas.io.formats.printing import pprint_thing

import pandas.core.missing as missing
from pandas.core.sparse.array import _maybe_to_sparse, SparseArray
from pandas._libs import lib, tslib
from pandas._libs.tslib import Timedelta
from pandas._libs.internals import BlockPlacement
from pandas._libs.tslibs import conversion

from pandas.util._decorators import cache_readonly
from pandas.util._validators import validate_bool_kwarg
from pandas import compat
from pandas.compat import range, map, zip, u


class Block(PandasObject):
    """
    Canonical n-dimensional unit of homogeneous dtype contained in a pandas
    data structure

    Index-ignorant; let the container take care of that
    """
    __slots__ = ['_mgr_locs', 'values', 'ndim']
    is_numeric = False
    is_float = False
    is_integer = False
    is_complex = False
    is_datetime = False
    is_datetimetz = False
    is_timedelta = False
    is_bool = False
    is_object = False
    is_categorical = False
    is_sparse = False
    is_extension = False
    _box_to_block_values = True
    _can_hold_na = False
    _can_consolidate = True
    _verify_integrity = True
    _validate_ndim = True
    _ftype = 'dense'
    _concatenator = staticmethod(np.concatenate)

    def __init__(self, values, placement, ndim=None):
        self.ndim = self._check_ndim(values, ndim)
        self.mgr_locs = placement
        self.values = values

        if (self._validate_ndim and self.ndim and
                len(self.mgr_locs) != len(self.values)):
            raise ValueError(
                'Wrong number of items passed {val}, placement implies '
                '{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs)))

    def _check_ndim(self, values, ndim):
        """ndim inference and validation.

        Infers ndim from 'values' if not provided to __init__.
        Validates that values.ndim and ndim are consistent if and only if
        the class variable '_validate_ndim' is True.

        Parameters
        ----------
        values : array-like
        ndim : int or None

        Returns
        -------
        ndim : int

        Raises
        ------
        ValueError : the number of dimensions does not match
        """
        if ndim is None:
            ndim = values.ndim

        if self._validate_ndim and values.ndim != ndim:
            msg = ("Wrong number of dimensions. values.ndim != ndim "
                   "[{} != {}]")
            raise ValueError(msg.format(values.ndim, ndim))

        return ndim
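
    # Illustrative note (an added sketch, assuming 0.23-era internals): in a
    # DataFrame, blocks are 2-D with axis 0 = items (columns) and axis 1 =
    # rows, which is why __init__ checks len(mgr_locs) against len(values);
    # Series blocks are 1-D. Hypothetical session:
    #
    #   >>> df = pd.DataFrame({'a': [1., 2.], 'b': [3., 4.]})
    #   >>> blk = df._data.blocks[0]
    #   >>> blk.ndim, blk.shape
    #   (2, (2, 2))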

    @property
    def _holder(self):
        """The array-like that can hold the underlying values.

        None for 'Block', overridden by subclasses that don't
        use an ndarray.
        """
        return None

    @property
    def _consolidate_key(self):
        return (self._can_consolidate, self.dtype.name)

    @property
    def _is_single_block(self):
        return self.ndim == 1

    @property
    def is_view(self):
        """ return a boolean if I am possibly a view """
        return self.values.base is not None

    @property
    def is_datelike(self):
        """ return True if I am a datelike """
        return self.is_datetime or self.is_timedelta

    def is_categorical_astype(self, dtype):
        """
        validate that we have an astype-able dtype for categorical;
        returns a boolean if we are a categorical
        """
        if dtype is Categorical or dtype is CategoricalDtype:
            # this is a pd.Categorical, but is not
            # a valid type for astypeing
            raise TypeError("invalid type {0} for astype".format(dtype))

        elif is_categorical_dtype(dtype):
            return True

        return False

    def external_values(self, dtype=None):
        """ return an outside world format, currently just the ndarray """
        return self.values

    def internal_values(self, dtype=None):
        """ return an internal format, currently just the ndarray;
        this should be the pure internal API format
        """
        return self.values

    def formatting_values(self):
        """Return the internal values used by the DataFrame/SeriesFormatter"""
        return self.internal_values()

    def get_values(self, dtype=None):
        """
        return an internal format, currently just the ndarray;
        this is often overridden to handle to_dense like operations
        """
        if is_object_dtype(dtype):
            return self.values.astype(object)
        return self.values

    def to_dense(self):
        return self.values.view()

    @property
    def _na_value(self):
        return np.nan

    @property
    def fill_value(self):
        return np.nan

    @property
    def mgr_locs(self):
        return self._mgr_locs

    @mgr_locs.setter
    def mgr_locs(self, new_mgr_locs):
        if not isinstance(new_mgr_locs, BlockPlacement):
            new_mgr_locs = BlockPlacement(new_mgr_locs)

        self._mgr_locs = new_mgr_locs

    @property
    def array_dtype(self):
        """ the dtype to return if I want to construct this block as an
        array
        """
        return self.dtype

    def make_block(self, values, placement=None, ndim=None):
        """
        Create a new block with type inference; propagate any values that
        are not specified
        """
        if placement is None:
            placement = self.mgr_locs
        if ndim is None:
            ndim = self.ndim

        return make_block(values, placement=placement, ndim=ndim)
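
    # Illustrative sketch (hypothetical REPL session): the module-level
    # make_block factory, defined later in this file, infers the Block
    # subclass from the dtype of `values`, so callers rarely name a class:
    #
    #   >>> make_block(np.array([1, 2, 3]), placement=slice(0, 3)).__class__
    #   <class 'pandas.core.internals.IntBlock'>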

    def make_block_scalar(self, values):
        """
        Create a ScalarBlock
        """
        return ScalarBlock(values)

    def make_block_same_class(self, values, placement=None, ndim=None,
                              dtype=None):
        """ Wrap given values in a block of same type as self. """
        if dtype is not None:
            # issue 19431 fastparquet is passing this
            warnings.warn("dtype argument is deprecated, will be removed "
                          "in a future release.", DeprecationWarning)
        if placement is None:
            placement = self.mgr_locs
        return make_block(values, placement=placement, ndim=ndim,
                          klass=self.__class__, dtype=dtype)

    def __unicode__(self):

        # don't want to print out all of the items here
        name = pprint_thing(self.__class__.__name__)
        if self._is_single_block:

            result = '{name}: {len} dtype: {dtype}'.format(
                name=name, len=len(self), dtype=self.dtype)

        else:

            shape = ' x '.join(pprint_thing(s) for s in self.shape)
            result = '{name}: {index}, {shape}, dtype: {dtype}'.format(
                name=name, index=pprint_thing(self.mgr_locs.indexer),
                shape=shape, dtype=self.dtype)

        return result

    def __len__(self):
        return len(self.values)

    def __getstate__(self):
        return self.mgr_locs.indexer, self.values

    def __setstate__(self, state):
        self.mgr_locs = BlockPlacement(state[0])
        self.values = state[1]
        self.ndim = self.values.ndim

    def _slice(self, slicer):
        """ return a slice of my values """
        return self.values[slicer]

    def reshape_nd(self, labels, shape, ref_items, mgr=None):
        """
        Parameters
        ----------
        labels : list of new axis labels
        shape : new shape
        ref_items : new ref_items

        return a new block that is transformed to a nd block
        """
        return _block2d_to_blocknd(values=self.get_values().T,
                                   placement=self.mgr_locs, shape=shape,
                                   labels=labels, ref_items=ref_items)

    def getitem_block(self, slicer, new_mgr_locs=None):
        """
        Perform __getitem__-like, return result as block.

        As of now, only supports slices that preserve dimensionality.
        """
        if new_mgr_locs is None:
            if isinstance(slicer, tuple):
                axis0_slicer = slicer[0]
            else:
                axis0_slicer = slicer
            new_mgr_locs = self.mgr_locs[axis0_slicer]

        new_values = self._slice(slicer)

        if self._validate_ndim and new_values.ndim != self.ndim:
            raise ValueError("Only same dim slicing is allowed")

        return self.make_block_same_class(new_values, new_mgr_locs)

    @property
    def shape(self):
        return self.values.shape

    @property
    def dtype(self):
        return self.values.dtype

    @property
    def ftype(self):
        return "{dtype}:{ftype}".format(dtype=self.dtype, ftype=self._ftype)

    def merge(self, other):
        return _merge_blocks([self, other])

    def concat_same_type(self, to_concat, placement=None):
        """
        Concatenate list of single blocks of the same type.
        """
        values = self._concatenator([blk.values for blk in to_concat],
                                    axis=self.ndim - 1)
        return self.make_block_same_class(
            values, placement=placement or slice(0, len(values), 1))

    def iget(self, i):
        return self.values[i]

    def set(self, locs, values, check=False):
        """
        Modify Block in-place with new item value

        Returns
        -------
        None
        """
        self.values[locs] = values

    def delete(self, loc):
        """
        Delete given loc(-s) from block in-place.
        """
        self.values = np.delete(self.values, loc, 0)
        self.mgr_locs = self.mgr_locs.delete(loc)

    def apply(self, func, mgr=None, **kwargs):
        """ apply the function to my values; return a block if we are not
        one
        """
        with np.errstate(all='ignore'):
            result = func(self.values, **kwargs)
        if not isinstance(result, Block):
            result = self.make_block(values=_block_shape(result,
                                                         ndim=self.ndim))

        return result

    def fillna(self, value, limit=None, inplace=False, downcast=None,
               mgr=None):
        """ fillna on the block with the value. If we fail, then convert to
        ObjectBlock and try again
        """
        inplace = validate_bool_kwarg(inplace, 'inplace')

        if not self._can_hold_na:
            if inplace:
                return self
            else:
                return self.copy()

        mask = isna(self.values)
        if limit is not None:
            if not is_integer(limit):
                raise ValueError('Limit must be an integer')
            if limit < 1:
                raise ValueError('Limit must be greater than 0')
            if self.ndim > 2:
                raise NotImplementedError("number of dimensions for 'fillna' "
                                          "is currently limited to 2")
            mask[mask.cumsum(self.ndim - 1) > limit] = False

        # fillna, but if we cannot coerce, then try again as an ObjectBlock
        try:
            values, _, _, _ = self._try_coerce_args(self.values, value)
            blocks = self.putmask(mask, value, inplace=inplace)
            blocks = [b.make_block(values=self._try_coerce_result(b.values))
                      for b in blocks]
            return self._maybe_downcast(blocks, downcast)
        except (TypeError, ValueError):

            # we can't process the value, but nothing to do
            if not mask.any():
                return self if inplace else self.copy()

            # operate column-by-column
            def f(m, v, i):
                block = self.coerce_to_target_dtype(value)

                # slice out our block
                if i is not None:
                    block = block.getitem_block(slice(i, i + 1))
                return block.fillna(value,
                                    limit=limit,
                                    inplace=inplace,
                                    downcast=None)

            return self.split_and_operate(mask, f, inplace)

    def split_and_operate(self, mask, f, inplace):
        """
        split the block per-column, and apply the callable f
        per-column, return a new block for each. Handle
        masking which will not change a block unless needed.

        Parameters
        ----------
        mask : 2-d boolean mask
        f : callable accepting (1d-mask, 1d values, indexer)
        inplace : boolean

        Returns
        -------
        list of blocks
        """

        if mask is None:
            mask = np.ones(self.shape, dtype=bool)
        new_values = self.values

        def make_a_block(nv, ref_loc):
            if isinstance(nv, Block):
                block = nv
            elif isinstance(nv, list):
                block = nv[0]
            else:
                # Put back the dimension that was taken from it and make
                # a block out of the result.
                try:
                    nv = _block_shape(nv, ndim=self.ndim)
                except (AttributeError, NotImplementedError):
                    pass
                block = self.make_block(values=nv,
                                        placement=ref_loc)
            return block

        # ndim == 1
        if self.ndim == 1:
            if mask.any():
                nv = f(mask, new_values, None)
            else:
                nv = new_values if inplace else new_values.copy()
            block = make_a_block(nv, self.mgr_locs)
            return [block]

        # ndim > 1
        new_blocks = []
        for i, ref_loc in enumerate(self.mgr_locs):
            m = mask[i]
            v = new_values[i]

            # need a new block
            if m.any():
                nv = f(m, v, i)
            else:
                nv = v if inplace else v.copy()

            block = make_a_block(nv, [ref_loc])
            new_blocks.append(block)

        return new_blocks
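
    # Illustrative walk-through: for a 2-D block, each row i (one column of
    # the frame) is passed to `f` only when mask[i] has any True entries;
    # clean rows are reused as-is (or copied when inplace=False). A fillna
    # over a 3-column block with one dirty column therefore returns three
    # single-column blocks, only one of which was actually rebuilt.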

    def _maybe_downcast(self, blocks, downcast=None):

        # no need to downcast our float
        # unless indicated
        if downcast is None and self.is_float:
            return blocks
        elif downcast is None and (self.is_timedelta or self.is_datetime):
            return blocks

        if not isinstance(blocks, list):
            blocks = [blocks]
        return _extend_blocks([b.downcast(downcast) for b in blocks])

    def downcast(self, dtypes=None, mgr=None):
        """ try to downcast each item to the dict of dtypes if present """

        # turn it off completely
        if dtypes is False:
            return self

        values = self.values

        # single block handling
        if self._is_single_block:

            # try to cast all non-floats here
            if dtypes is None:
                dtypes = 'infer'

            nv = maybe_downcast_to_dtype(values, dtypes)
            return self.make_block(nv)

        # ndim > 1
        if dtypes is None:
            return self

        if not (dtypes == 'infer' or isinstance(dtypes, dict)):
            raise ValueError("downcast must have a dictionary or 'infer' as "
                             "its argument")

        # operate column-by-column
        # this is expensive as it splits the blocks items-by-item
        def f(m, v, i):

            if dtypes == 'infer':
                dtype = 'infer'
            else:
                raise AssertionError("dtypes as dict is not supported yet")

            if dtype is not None:
                v = maybe_downcast_to_dtype(v, dtype)
            return v

        return self.split_and_operate(None, f, False)
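
    # Illustrative sketch (hypothetical REPL session) of the 'infer' path:
    # float values that are all integral downcast to int64, e.g.
    #
    #   >>> blk = make_block(np.array([1.0, 2.0]), placement=slice(0, 2))
    #   >>> blk.downcast('infer').dtype
    #   dtype('int64')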

    def astype(self, dtype, copy=False, errors='raise', values=None,
               **kwargs):
        return self._astype(dtype, copy=copy, errors=errors, values=values,
                            **kwargs)

    def _astype(self, dtype, copy=False, errors='raise', values=None,
                klass=None, mgr=None, **kwargs):
        """Coerce to the new type

        Parameters
        ----------
        dtype : str, dtype convertible
        copy : boolean, default False
            copy if indicated
        errors : str, {'raise', 'ignore'}, default 'raise'
            - ``raise`` : allow exceptions to be raised
            - ``ignore`` : suppress exceptions. On error return original object

        Returns
        -------
        Block
        """
        errors_legal_values = ('raise', 'ignore')

        if errors not in errors_legal_values:
            invalid_arg = ("Expected value of kwarg 'errors' to be one of {}. "
                           "Supplied value is '{}'".format(
                               list(errors_legal_values), errors))
            raise ValueError(invalid_arg)

        if (inspect.isclass(dtype) and
                issubclass(dtype, (PandasExtensionDtype, ExtensionDtype))):
            msg = ("Expected an instance of {}, but got the class instead. "
                   "Try instantiating 'dtype'.".format(dtype.__name__))
            raise TypeError(msg)

        # may need to convert to categorical
        if self.is_categorical_astype(dtype):

            # deprecated 17636
            if ('categories' in kwargs or 'ordered' in kwargs):
                if isinstance(dtype, CategoricalDtype):
                    raise TypeError(
                        "Cannot specify a CategoricalDtype and also "
                        "`categories` or `ordered`. Use "
                        "`dtype=CategoricalDtype(categories, ordered)`"
                        " instead.")
                warnings.warn("specifying 'categories' or 'ordered' in "
                              ".astype() is deprecated; pass a "
                              "CategoricalDtype instead",
                              FutureWarning, stacklevel=7)

            categories = kwargs.get('categories', None)
            ordered = kwargs.get('ordered', None)
            if com._any_not_none(categories, ordered):
                dtype = CategoricalDtype(categories, ordered)

            if is_categorical_dtype(self.values):
                # GH 10696/18593: update an existing categorical efficiently
                return self.make_block(self.values.astype(dtype, copy=copy))

            return self.make_block(Categorical(self.values, dtype=dtype))

        # astype processing
        dtype = np.dtype(dtype)
        if self.dtype == dtype:
            if copy:
                return self.copy()
            return self

        if klass is None:
            if dtype == np.object_:
                klass = ObjectBlock
        try:
            # force the copy here
            if values is None:

                if issubclass(dtype.type,
                              (compat.text_type, compat.string_types)):

                    # use native type formatting for datetime/tz/timedelta
                    if self.is_datelike:
                        values = self.to_native_types()

                    # astype formatting
                    else:
                        values = self.get_values()

                else:
                    values = self.get_values(dtype=dtype)

                # _astype_nansafe works fine with 1-d only
                values = astype_nansafe(values.ravel(), dtype, copy=True)
                values = values.reshape(self.shape)

            newb = make_block(values, placement=self.mgr_locs,
                              klass=klass)
        except Exception:
            if errors == 'raise':
                raise
            newb = self.copy() if copy else self

        if newb.is_numeric and self.is_numeric:
            if newb.shape != self.shape:
                raise TypeError(
                    "cannot set astype for copy = [{copy}] for dtype "
                    "({dtype} [{itemsize}]) with smaller itemsize than "
                    "current ({newb_dtype} [{newb_size}])".format(
                        copy=copy, dtype=self.dtype.name,
                        itemsize=self.itemsize, newb_dtype=newb.dtype.name,
                        newb_size=newb.itemsize))
        return newb

    def convert(self, copy=True, **kwargs):
        """ attempt to coerce any object types to better types; return a
        copy of the block (if copy = True). By definition we are not an
        ObjectBlock here!
        """

        return self.copy() if copy else self

    def _can_hold_element(self, element):
        """ require the same dtype as ourselves """
        dtype = self.values.dtype.type
        tipo = maybe_infer_dtype_type(element)
        if tipo is not None:
            return issubclass(tipo.type, dtype)
        return isinstance(element, dtype)
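
    # Illustrative note: subclasses specialize this test, e.g. an IntBlock
    # can hold another integer but not a float, so (hypothetical values)
    # int_block._can_hold_element(5) is True while
    # int_block._can_hold_element(5.5) is False; a False here is what
    # triggers the upcasting paths in setitem/putmask further down.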

    def _try_cast_result(self, result, dtype=None):
        """ try to cast the result to our original type; we may have
        roundtripped thru object in the mean-time
        """
        if dtype is None:
            dtype = self.dtype

        if self.is_integer or self.is_bool or self.is_datetime:
            pass
        elif self.is_float and result.dtype == self.dtype:

            # protect against a bool/object showing up here
            if isinstance(dtype, compat.string_types) and dtype == 'infer':
                return result
            if not isinstance(dtype, type):
                dtype = dtype.type
            if issubclass(dtype, (np.bool_, np.object_)):
                if issubclass(dtype, np.bool_):
                    if isna(result).all():
                        return result.astype(np.bool_)
                    else:
                        result = result.astype(np.object_)
                        result[result == 1] = True
                        result[result == 0] = False
                        return result
                else:
                    return result.astype(np.object_)

            return result

        # may need to change the dtype here
        return maybe_downcast_to_dtype(result, dtype)

    def _try_coerce_args(self, values, other):
        """ provide coercion to our input arguments """

        if np.any(notna(other)) and not self._can_hold_element(other):
            # coercion issues
            # let higher levels handle
            raise TypeError("cannot convert {} to an {}".format(
                type(other).__name__,
                type(self).__name__.lower().replace('Block', '')))

        return values, False, other, False

    def _try_coerce_result(self, result):
        """ reverse of try_coerce_args """
        return result

    def _try_coerce_and_cast_result(self, result, dtype=None):
        result = self._try_coerce_result(result)
        result = self._try_cast_result(result, dtype=dtype)
        return result

    def to_native_types(self, slicer=None, na_rep='nan', quoting=None,
                        **kwargs):
        """ convert to our native types format, slicing if desired """

        values = self.get_values()

        if slicer is not None:
            values = values[:, slicer]
        mask = isna(values)

        if not self.is_object and not quoting:
            values = values.astype(str)
        else:
            values = np.array(values, dtype='object')

        values[mask] = na_rep
        return values

    # block actions ####
    def copy(self, deep=True, mgr=None):
        """ copy constructor """
        values = self.values
        if deep:
            values = values.copy()
        return self.make_block_same_class(values)

    def replace(self, to_replace, value, inplace=False, filter=None,
                regex=False, convert=True, mgr=None):
        """ replace the to_replace value with value; possible to create new
        blocks here, this is just a call to putmask. regex is not used here.
        It is used in ObjectBlocks. It is here for API
        compatibility.
        """

        inplace = validate_bool_kwarg(inplace, 'inplace')
        original_to_replace = to_replace

        # try to replace, if we raise an error, convert to ObjectBlock and
        # retry
        try:
            values, _, to_replace, _ = self._try_coerce_args(self.values,
                                                             to_replace)
            mask = missing.mask_missing(values, to_replace)
            if filter is not None:
                filtered_out = ~self.mgr_locs.isin(filter)
                mask[filtered_out.nonzero()[0]] = False

            blocks = self.putmask(mask, value, inplace=inplace)
            if convert:
                blocks = [b.convert(by_item=True, numeric=False,
                                    copy=not inplace) for b in blocks]
            return blocks
        except (TypeError, ValueError):

            # try again with a compatible block
            block = self.astype(object)
            return block.replace(
                to_replace=original_to_replace, value=value, inplace=inplace,
                filter=filter, regex=regex, convert=convert)

    def _replace_single(self, *args, **kwargs):
        """ no-op on a non-ObjectBlock """
        return self if kwargs['inplace'] else self.copy()

    def setitem(self, indexer, value, mgr=None):
        """Set the value inplace, returning a maybe different typed block.

        Parameters
        ----------
        indexer : tuple, list-like, array-like, slice
            The subset of self.values to set
        value : object
            The value being set
        mgr : BlockManager, optional

        Returns
        -------
        Block

        Notes
        -----
        `indexer` is a direct slice/positional indexer. `value` must
        be a compatible shape.
        """
        # coerce None values, if appropriate
        if value is None:
            if self.is_numeric:
                value = np.nan

        # coerce if block dtype can store value
        values = self.values
        try:
            values, _, value, _ = self._try_coerce_args(values, value)
            # can keep its own dtype
            if hasattr(value, 'dtype') and is_dtype_equal(values.dtype,
                                                          value.dtype):
                dtype = self.dtype
            else:
                dtype = 'infer'

        except (TypeError, ValueError):
            # current dtype cannot store value, coerce to common dtype
            find_dtype = False

            if hasattr(value, 'dtype'):
                dtype = value.dtype
                find_dtype = True

            elif is_scalar(value):
                if isna(value):
                    # NaN promotion is handled in latter path
                    dtype = False
                else:
                    dtype, _ = infer_dtype_from_scalar(value,
                                                       pandas_dtype=True)
                    find_dtype = True
            else:
                dtype = 'infer'

            if find_dtype:
                dtype = find_common_type([values.dtype, dtype])
                if not is_dtype_equal(self.dtype, dtype):
                    b = self.astype(dtype)
                    return b.setitem(indexer, value, mgr=mgr)

        # value must be storeable at this moment
        arr_value = np.array(value)

        # cast the values to a type that can hold nan (if necessary)
        if not self._can_hold_element(value):
            dtype, _ = maybe_promote(arr_value.dtype)
            values = values.astype(dtype)

        transf = (lambda x: x.T) if self.ndim == 2 else (lambda x: x)
        values = transf(values)

        # length checking
        check_setitem_lengths(indexer, value, values)

        def _is_scalar_indexer(indexer):
            # return True if we are all scalar indexers

            if arr_value.ndim == 1:
                if not isinstance(indexer, tuple):
                    indexer = tuple([indexer])
                return all(is_scalar(idx) for idx in indexer)
            return False

        def _is_empty_indexer(indexer):
            # return a boolean if we have an empty indexer

            if is_list_like(indexer) and not len(indexer):
                return True
            if arr_value.ndim == 1:
                if not isinstance(indexer, tuple):
                    indexer = tuple([indexer])
                return any(isinstance(idx, np.ndarray) and len(idx) == 0
                           for idx in indexer)
            return False

        # empty indexers
        # 8669 (empty)
        if _is_empty_indexer(indexer):
            pass

        # setting a single element for each dim and with a rhs that could
        # be say a list
        # GH 6043
        elif _is_scalar_indexer(indexer):
            values[indexer] = value

        # if we are an exact match (ex-broadcasting),
        # then use the resultant dtype
        elif (len(arr_value.shape) and
              arr_value.shape[0] == values.shape[0] and
              np.prod(arr_value.shape) == np.prod(values.shape)):
            values[indexer] = value
            try:
                values = values.astype(arr_value.dtype)
            except ValueError:
                pass

        # set
        else:
            values[indexer] = value

        # coerce and try to infer the dtypes of the result
        values = self._try_coerce_and_cast_result(values, dtype)
        block = self.make_block(transf(values))
        return block
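
    # Illustrative sketch of the common-dtype branch above (hypothetical
    # REPL session): assigning a float into an integer block coerces the
    # whole block, and the new block's dtype surfaces on the Series:
    #
    #   >>> s = pd.Series([1, 2, 3])
    #   >>> s[0] = 0.5
    #   >>> s.dtype
    #   dtype('float64')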

    def putmask(self, mask, new, align=True, inplace=False, axis=0,
                transpose=False, mgr=None):
        """ putmask the data to the block; it is possible that we may create
        a new dtype of block

        return the resulting block(s)

        Parameters
        ----------
        mask : the condition to respect
        new : a ndarray/object
        align : boolean, perform alignment on other/cond, default is True
        inplace : perform inplace modification, default is False
        axis : int
        transpose : boolean
            Set to True if self is stored with axes reversed

        Returns
        -------
        a list of new blocks, the result of the putmask
        """

        new_values = self.values if inplace else self.values.copy()

        new = getattr(new, 'values', new)
        mask = getattr(mask, 'values', mask)

        # if we are passed a scalar None, convert it here
        if not is_list_like(new) and isna(new) and not self.is_object:
            new = self.fill_value

        if self._can_hold_element(new):
            _, _, new, _ = self._try_coerce_args(new_values, new)

            if transpose:
                new_values = new_values.T

            # If the default repeat behavior in np.putmask would go in the
            # wrong direction, then explicitly repeat and reshape new instead
            if getattr(new, 'ndim', 0) >= 1:
                if self.ndim - 1 == new.ndim and axis == 1:
                    new = np.repeat(
                        new, new_values.shape[-1]).reshape(self.shape)
                new = new.astype(new_values.dtype)

            # we require exact matches between the len of the
            # values we are setting (or is compat). np.putmask
            # doesn't check this and will simply truncate / pad
            # the output, but we want sane error messages
            #
            # TODO: this prob needs some better checking
            # for 2D cases
            if ((is_list_like(new) and
                 np.any(mask[mask]) and
                 getattr(new, 'ndim', 1) == 1)):

                if not (mask.shape[-1] == len(new) or
                        mask[mask].shape[-1] == len(new) or
                        len(new) == 1):
                    raise ValueError("cannot assign mismatched "
                                     "length to masked array")

            np.putmask(new_values, mask, new)

        # maybe upcast me
        elif mask.any():
            if transpose:
                mask = mask.T
                if isinstance(new, np.ndarray):
                    new = new.T
                axis = new_values.ndim - axis - 1

            # Pseudo-broadcast
            if getattr(new, 'ndim', 0) >= 1:
                if self.ndim - 1 == new.ndim:
                    new_shape = list(new.shape)
                    new_shape.insert(axis, 1)
                    new = new.reshape(tuple(new_shape))

            # operate column-by-column
            def f(m, v, i):

                if i is None:
                    # ndim==1 case.
                    n = new
                else:

                    if isinstance(new, np.ndarray):
                        n = np.squeeze(new[i % new.shape[0]])
                    else:
                        n = np.array(new)

                    # type of the new block
                    dtype, _ = maybe_promote(n.dtype)

                    # we need to explicitly astype here to make a copy
                    n = n.astype(dtype)

                nv = _putmask_smart(v, m, n)
                return nv

            new_blocks = self.split_and_operate(mask, f, inplace)
            return new_blocks

        if inplace:
            return [self]

        if transpose:
            new_values = new_values.T

        return [self.make_block(new_values)]

    def coerce_to_target_dtype(self, other):
        """
        coerce the current block to a dtype compat for other;
        we will return a block, possibly object, and not raise.

        we can also safely try to coerce to the same dtype
        and will receive the same block
        """

        # if we cannot then coerce to object
        dtype, _ = infer_dtype_from(other, pandas_dtype=True)

        if is_dtype_equal(self.dtype, dtype):
            return self

        if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype):
            # we don't upcast to bool
            return self.astype(object)

        elif ((self.is_float or self.is_complex) and
              (is_integer_dtype(dtype) or is_float_dtype(dtype))):
            # don't coerce float/complex to int
            return self

        elif (self.is_datetime or
              is_datetime64_dtype(dtype) or
              is_datetime64tz_dtype(dtype)):

            # not a datetime
            if not ((is_datetime64_dtype(dtype) or
                     is_datetime64tz_dtype(dtype)) and self.is_datetime):
                return self.astype(object)

            # don't upcast timezone with different timezone or no timezone
            mytz = getattr(self.dtype, 'tz', None)
            othertz = getattr(dtype, 'tz', None)

            if str(mytz) != str(othertz):
                return self.astype(object)

            raise AssertionError("possible recursion in "
                                 "coerce_to_target_dtype: {} {}".format(
                                     self, other))

        elif (self.is_timedelta or is_timedelta64_dtype(dtype)):

            # not a timedelta
            if not (is_timedelta64_dtype(dtype) and self.is_timedelta):
                return self.astype(object)

            raise AssertionError("possible recursion in "
                                 "coerce_to_target_dtype: {} {}".format(
                                     self, other))

        try:
            return self.astype(dtype)
        except (ValueError, TypeError):
            pass

        return self.astype(object)
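
    # Illustrative dtype resolutions (example dtypes, not exhaustive):
    # int64 block vs. float scalar  -> astype(float64) via the trailing try;
    # int64 block vs. a string      -> falls through to astype(object);
    # datetime64[ns, UTC] vs. naive datetime -> astype(object), since
    # mismatched timezones are never silently unified.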

    def interpolate(self, method='pad', axis=0, index=None, values=None,
                    inplace=False, limit=None, limit_direction='forward',
                    limit_area=None, fill_value=None, coerce=False,
                    downcast=None, mgr=None, **kwargs):

        inplace = validate_bool_kwarg(inplace, 'inplace')

        def check_int_bool(self, inplace):
            # Only FloatBlocks will contain NaNs.
            # timedelta subclasses IntBlock
            if (self.is_bool or self.is_integer) and not self.is_timedelta:
                if inplace:
                    return self
                else:
                    return self.copy()

        # a fill na type method
        try:
            m = missing.clean_fill_method(method)
        except Exception:
            m = None

        if m is not None:
            r = check_int_bool(self, inplace)
            if r is not None:
                return r
            return self._interpolate_with_fill(method=m, axis=axis,
                                               inplace=inplace, limit=limit,
                                               fill_value=fill_value,
                                               coerce=coerce,
                                               downcast=downcast, mgr=mgr)
        # try an interp method
        try:
            m = missing.clean_interp_method(method, **kwargs)
        except Exception:
            m = None

        if m is not None:
            r = check_int_bool(self, inplace)
            if r is not None:
                return r
            return self._interpolate(method=m, index=index, values=values,
                                     axis=axis, limit=limit,
                                     limit_direction=limit_direction,
                                     limit_area=limit_area,
                                     fill_value=fill_value, inplace=inplace,
                                     downcast=downcast, mgr=mgr, **kwargs)

        raise ValueError("invalid method '{0}' to interpolate.".format(method))

    def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
                               limit=None, fill_value=None, coerce=False,
                               downcast=None, mgr=None):
        """ fillna but using the interpolate machinery """

        inplace = validate_bool_kwarg(inplace, 'inplace')

        # if we are coercing, then don't force the conversion
        # if the block can't hold the type
        if coerce:
            if not self._can_hold_na:
                if inplace:
                    return [self]
                else:
                    return [self.copy()]

        values = self.values if inplace else self.values.copy()
        values, _, fill_value, _ = self._try_coerce_args(values, fill_value)
        values = missing.interpolate_2d(values, method=method, axis=axis,
                                        limit=limit, fill_value=fill_value,
                                        dtype=self.dtype)
        values = self._try_coerce_result(values)

        blocks = [self.make_block_same_class(values, ndim=self.ndim)]
        return self._maybe_downcast(blocks, downcast)

    def _interpolate(self, method=None, index=None, values=None,
                     fill_value=None, axis=0, limit=None,
                     limit_direction='forward', limit_area=None,
                     inplace=False, downcast=None, mgr=None, **kwargs):
        """ interpolate using scipy wrappers """

        inplace = validate_bool_kwarg(inplace, 'inplace')
        data = self.values if inplace else self.values.copy()

        # only deal with floats
        if not self.is_float:
            if not self.is_integer:
                return self
            data = data.astype(np.float64)

        if fill_value is None:
            fill_value = self.fill_value

        if method in ('krogh', 'piecewise_polynomial', 'pchip'):
            if not index.is_monotonic:
                raise ValueError("{0} interpolation requires that the "
                                 "index be monotonic.".format(method))
        # process 1-d slices in the axis direction

        def func(x):

            # process a 1-d slice, returning it
            # should the axis argument be handled below in apply_along_axis?
            # i.e. not an arg to missing.interpolate_1d
            return missing.interpolate_1d(index, x, method=method,
                                          limit=limit,
                                          limit_direction=limit_direction,
                                          limit_area=limit_area,
                                          fill_value=fill_value,
                                          bounds_error=False, **kwargs)

        # interp each column independently
        interp_values = np.apply_along_axis(func, axis, data)

        blocks = [self.make_block_same_class(interp_values)]
        return self._maybe_downcast(blocks, downcast)

    def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
        """
        Take values according to indexer and return them as a block.
        """

        # algos.take_nd dispatches for DatetimeTZBlock, CategoricalBlock
        # so need to preserve types
        # sparse is treated like an ndarray, but needs .get_values() shaping

        values = self.values
        if self.is_sparse:
            values = self.get_values()

        if fill_tuple is None:
            fill_value = self.fill_value
            new_values = algos.take_nd(values, indexer, axis=axis,
                                       allow_fill=False)
        else:
            fill_value = fill_tuple[0]
            new_values = algos.take_nd(values, indexer, axis=axis,
                                       allow_fill=True,
                                       fill_value=fill_value)

        if new_mgr_locs is None:
            if axis == 0:
                slc = libinternals.indexer_as_slice(indexer)
                if slc is not None:
                    new_mgr_locs = self.mgr_locs[slc]
                else:
                    new_mgr_locs = self.mgr_locs[indexer]
            else:
                new_mgr_locs = self.mgr_locs

        if not is_dtype_equal(new_values.dtype, self.dtype):
            return self.make_block(new_values, new_mgr_locs)
        else:
            return self.make_block_same_class(new_values, new_mgr_locs)

    def diff(self, n, axis=1, mgr=None):
        """ return block for the diff of the values """
        new_values = algos.diff(self.values, n, axis=axis)
        return [self.make_block(values=new_values)]

    def shift(self, periods, axis=0, mgr=None):
        """ shift the block by periods, possibly upcast """

        # convert integer to float if necessary. need to do a lot more than
        # that, handle boolean etc also
        new_values, fill_value = maybe_upcast(self.values)

        # make sure array sent to np.roll is c_contiguous
        f_ordered = new_values.flags.f_contiguous
        if f_ordered:
            new_values = new_values.T
            axis = new_values.ndim - axis - 1

        if np.prod(new_values.shape):
            new_values = np.roll(new_values, _ensure_platform_int(periods),
                                 axis=axis)

        axis_indexer = [slice(None)] * self.ndim
        if periods > 0:
            axis_indexer[axis] = slice(None, periods)
        else:
            axis_indexer[axis] = slice(periods, None)
        new_values[tuple(axis_indexer)] = fill_value

        # restore original order
        if f_ordered:
            new_values = new_values.T

        return [self.make_block(new_values)]
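
    # Illustrative sketch: maybe_upcast above gives the block a dtype that
    # can hold the fill value before rolling, which is why shifting integer
    # data yields floats with NaN in the vacated slots, e.g.
    #
    #   >>> pd.Series([1, 2, 3]).shift(1).dtype
    #   dtype('float64')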

    def eval(self, func, other, errors='raise', try_cast=False, mgr=None):
        """
        evaluate the block; return result block from the result

        Parameters
        ----------
        func : how to combine self, other
        other : a ndarray/object
        errors : str, {'raise', 'ignore'}, default 'raise'
            - ``raise`` : allow exceptions to be raised
            - ``ignore`` : suppress exceptions. On error return original object

        try_cast : try casting the results to the input type

        Returns
        -------
        a new block, the result of the func
        """
        orig_other = other
        values = self.values

        other = getattr(other, 'values', other)

        # make sure that we can broadcast
        is_transposed = False
        if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
            if values.ndim != other.ndim:
                is_transposed = True
            else:
                if values.shape == other.shape[::-1]:
                    is_transposed = True
                elif values.shape[0] == other.shape[-1]:
                    is_transposed = True
                else:
                    # this is a broadcast error here
                    raise ValueError(
                        "cannot broadcast shape [{t_shape}] with "
                        "block values [{oth_shape}]".format(
                            t_shape=values.T.shape, oth_shape=other.shape))

        transf = (lambda x: x.T) if is_transposed else (lambda x: x)

        # coerce/transpose the args if needed
        try:
            values, values_mask, other, other_mask = self._try_coerce_args(
                transf(values), other)
        except TypeError:
            block = self.coerce_to_target_dtype(orig_other)
            return block.eval(func, orig_other,
                              errors=errors,
                              try_cast=try_cast, mgr=mgr)

        # get the result, may need to transpose the other
        def get_result(other):

            # avoid numpy warning of comparisons against None
            if other is None:
                result = not func.__name__ == 'eq'

            # avoid numpy warning of elementwise comparisons to object
            elif is_numeric_v_string_like(values, other):
                result = False

            # avoid numpy warning of elementwise comparisons
            elif func.__name__ == 'eq':
                if is_list_like(other) and not isinstance(other, np.ndarray):
                    other = np.asarray(other)

                    # if we can broadcast, then ok
                    if values.shape[-1] != other.shape[-1]:
                        return False
                result = func(values, other)
            else:
                result = func(values, other)

            # mask if needed
            if isinstance(values_mask, np.ndarray) and values_mask.any():
                result = result.astype('float64', copy=False)
                result[values_mask] = np.nan
            if other_mask is True:
                result = result.astype('float64', copy=False)
                result[:] = np.nan
            elif isinstance(other_mask, np.ndarray) and other_mask.any():
                result = result.astype('float64', copy=False)
                result[other_mask.ravel()] = np.nan

            return result

        # error handler if we have an issue operating with the function
        def handle_error():

            if errors == 'raise':
                # The 'detail' variable is defined in outer scope.
                raise TypeError(
                    'Could not operate {other!r} with block values '
                    '{detail!s}'.format(other=other, detail=detail))  # noqa
            else:
                # return the values
                result = np.empty(values.shape, dtype='O')
                result.fill(np.nan)
                return result

        # get the result
        try:
            with np.errstate(all='ignore'):
                result = get_result(other)

        # if we have an invalid shape/broadcast error
        # GH4576, so raise instead of allowing to pass through
        except ValueError as detail:
            raise
        except Exception as detail:
            result = handle_error()

        # technically a broadcast error in numpy can 'work' by returning a
        # boolean False
        if not isinstance(result, np.ndarray):

            # differentiate between an invalid ndarray-ndarray comparison
            # and an invalid type comparison
            if isinstance(values, np.ndarray) and is_list_like(other):
                raise ValueError(
                    'Invalid broadcasting comparison [{other!r}] with '
                    'block values'.format(other=other))

            raise TypeError('Could not compare [{other!r}] '
                            'with block values'.format(other=other))

        # transpose if needed
        result = transf(result)

        # try to cast if requested
        if try_cast:
            result = self._try_cast_result(result)

        result = _block_shape(result, ndim=self.ndim)
        return [self.make_block(result)]

    def where(self, other, cond, align=True, errors='raise',
              try_cast=False, axis=0, transpose=False, mgr=None):
        """
        evaluate the block; return result block(s) from the result

        Parameters
        ----------
        other : a ndarray/object
        cond : the condition to respect
        align : boolean, perform alignment on other/cond
        errors : str, {'raise', 'ignore'}, default 'raise'
            - ``raise`` : allow exceptions to be raised
            - ``ignore`` : suppress exceptions. On error return original object

        axis : int
        transpose : boolean
            Set to True if self is stored with axes reversed

        Returns
        -------
        a new block(s), the result of the func
        """
        import pandas.core.computation.expressions as expressions
        assert errors in ['raise', 'ignore']

        values = self.values
        orig_other = other
        if transpose:
            values = values.T

        other = getattr(other, 'values', other)
        cond = getattr(cond, 'values', cond)

        # If the default broadcasting would go in the wrong direction, then
        # explicitly reshape other instead
        if getattr(other, 'ndim', 0) >= 1:
            if values.ndim - 1 == other.ndim and axis == 1:
                other = other.reshape(tuple(other.shape + (1, )))

        if not hasattr(cond, 'shape'):
            raise ValueError("where must have a condition that is ndarray "
                             "like")

        # our where function
        def func(cond, values, other):
            if cond.ravel().all():
                return values

            values, values_mask, other, other_mask = self._try_coerce_args(
                values, other)

            try:
                return self._try_coerce_result(expressions.where(
                    cond, values, other))
            except Exception as detail:
                if errors == 'raise':
                    raise TypeError(
                        'Could not operate [{other!r}] with block values '
                        '[{detail!s}]'.format(other=other, detail=detail))
                else:
                    # return the values
                    result = np.empty(values.shape, dtype='float64')
                    result.fill(np.nan)
                    return result

        # see if we can operate on the entire block, or need item-by-item
        # or if we are a single block (ndim == 1)
        try:
            result = func(cond, values, other)
        except TypeError:

            # we cannot coerce, return a compat dtype
            # we are explicitly ignoring errors
            block = self.coerce_to_target_dtype(other)
            blocks = block.where(orig_other, cond, align=align,
                                 errors=errors,
                                 try_cast=try_cast, axis=axis,
                                 transpose=transpose)
            return self._maybe_downcast(blocks, 'infer')

        if self._can_hold_na or self.ndim == 1:

            if transpose:
                result = result.T

            # try to cast if requested
            if try_cast:
                result = self._try_cast_result(result)

            return self.make_block(result)

        # might need to separate out blocks
        axis = cond.ndim - 1
        cond = cond.swapaxes(axis, 0)
        mask = np.array([cond[i].all() for i in range(cond.shape[0])],
                        dtype=bool)

        result_blocks = []
        for m in [mask, ~mask]:
            if m.any():
                r = self._try_cast_result(result.take(m.nonzero()[0],
                                                      axis=axis))
                result_blocks.append(
                    self.make_block(r.T, placement=self.mgr_locs[m]))

        return result_blocks

    def equals(self, other):
        if self.dtype != other.dtype or self.shape != other.shape:
            return False
        return array_equivalent(self.values, other.values)

    def _unstack(self, unstacker_func, new_columns):
        """Return a list of unstacked blocks of self

        Parameters
        ----------
        unstacker_func : callable
            Partially applied unstacker.
        new_columns : Index
            All columns of the unstacked BlockManager.

        Returns
        -------
        blocks : list of Block
            New blocks of unstacked values.
        mask : array_like of bool
            The mask of columns of `blocks` we should keep.
        """
        unstacker = unstacker_func(self.values.T)
        new_items = unstacker.get_new_columns()
        new_placement = new_columns.get_indexer(new_items)
        new_values, mask = unstacker.get_new_values()

        mask = mask.any(0)
        new_values = new_values.T[mask]
        new_placement = new_placement[mask]

        blocks = [make_block(new_values, placement=new_placement)]
        return blocks, mask

    def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
        """
        compute the quantiles of the block

        Parameters
        ----------
        qs: a scalar or list of the quantiles to be computed
        interpolation: type of interpolation, default 'linear'
        axis: axis to compute, default 0

        Returns
        -------
        tuple of (axis, block)

        """
        kw = {'interpolation': interpolation}
        values = self.get_values()
        values, _, _, _ = self._try_coerce_args(values, values)

        def _nanpercentile1D(values, mask, q, **kw):
            values = values[~mask]

            if len(values) == 0:
                if is_scalar(q):
                    return self._na_value
                else:
                    return np.array([self._na_value] * len(q),
                                    dtype=values.dtype)

            return np.percentile(values, q, **kw)

        def _nanpercentile(values, q, axis, **kw):

            mask = isna(self.values)
            if not is_scalar(mask) and mask.any():
                if self.ndim == 1:
                    return _nanpercentile1D(values, mask, q, **kw)
                else:
                    # for nonconsolidatable blocks mask is 1D, but values 2D
                    if mask.ndim < values.ndim:
                        mask = mask.reshape(values.shape)
                    if axis == 0:
                        values = values.T
                        mask = mask.T
                    result = [_nanpercentile1D(val, m, q, **kw) for (val, m)
                              in zip(list(values), list(mask))]
                    result = np.array(result, dtype=values.dtype,
                                      copy=False).T
                    return result
            else:
                return np.percentile(values, q, axis=axis, **kw)

        from pandas import Float64Index
        is_empty = values.shape[axis] == 0
        if is_list_like(qs):
            ax = Float64Index(qs)

            if is_empty:
                if self.ndim == 1:
                    result = self._na_value
                else:
                    # create the array of na_values
                    # 2d len(values) * len(qs)
                    result = np.repeat(np.array([self._na_value] * len(qs)),
                                       len(values)).reshape(len(values),
                                                            len(qs))
            else:

                try:
                    result = _nanpercentile(values, np.array(qs) * 100,
                                            axis=axis, **kw)
                except ValueError:

                    # older numpies don't handle an array for q
                    result = [_nanpercentile(values, q * 100,
                                             axis=axis, **kw) for q in qs]

                result = np.array(result, copy=False)
                if self.ndim > 1:
                    result = result.T

        else:

            if self.ndim == 1:
                ax = Float64Index([qs])
            else:
                ax = mgr.axes[0]

            if is_empty:
                if self.ndim == 1:
                    result = self._na_value
                else:
                    result = np.array([self._na_value] * len(self))
            else:
                result = _nanpercentile(values, qs * 100, axis=axis, **kw)

        ndim = getattr(result, 'ndim', None) or 0
        result = self._try_coerce_result(result)
        if is_scalar(result):
            return ax, self.make_block_scalar(result)
        return ax, make_block(result,
                              placement=np.arange(len(result)),
                              ndim=ndim)


class ScalarBlock(Block):
    """
    a scalar compat Block
    """
    __slots__ = ['_mgr_locs', 'values', 'ndim']

    def __init__(self, values):
        self.ndim = 0
        self.mgr_locs = [0]
        self.values = values

    @property
    def dtype(self):
        return type(self.values)

    @property
    def shape(self):
        return tuple([0])

    def __len__(self):
        return 0


class NonConsolidatableMixIn(object):
    """ hold methods for the nonconsolidatable blocks """
    _can_consolidate = False
    _verify_integrity = False
    _validate_ndim = False

    def __init__(self, values, placement, ndim=None):
        """Initialize a non-consolidatable block.

        'ndim' may be inferred from 'placement'.

        This will continue to call __init__ for the other base
        classes mixed in with this Mixin.
        """
        # Placement must be converted to BlockPlacement so that we can check
        # its length
        if not isinstance(placement, BlockPlacement):
            placement = BlockPlacement(placement)

        # Maybe infer ndim from placement
        if ndim is None:
            if len(placement) != 1:
                ndim = 1
            else:
                ndim = 2
        super(NonConsolidatableMixIn, self).__init__(values, placement,
                                                     ndim=ndim)

    @property
    def shape(self):
        if self.ndim == 1:
            return (len(self.values),)
        return (len(self.mgr_locs), len(self.values))
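
    # Illustrative note: a non-consolidatable block always holds exactly one
    # item, so its 2-D shape is (len(mgr_locs), len(values)) == (1, n_rows);
    # e.g. one tz-aware datetime column of 5 rows reports shape (1, 5).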

    def get_values(self, dtype=None):
        """ need to to_dense myself (and always return a ndim sized object) """
        values = self.values.to_dense()
        if values.ndim == self.ndim - 1:
            values = values.reshape((1,) + values.shape)
        return values

    def iget(self, col):

        if self.ndim == 2 and isinstance(col, tuple):
            col, loc = col
            if not com.is_null_slice(col) and col != 0:
                raise IndexError("{0} only contains one item".format(self))
            return self.values[loc]
        else:
            if col != 0:
                raise IndexError("{0} only contains one item".format(self))
            return self.values

    def should_store(self, value):
        return isinstance(value, self._holder)

    def set(self, locs, values, check=False):
        assert locs.tolist() == [0]
        self.values = values

    def putmask(self, mask, new, align=True, inplace=False, axis=0,
                transpose=False, mgr=None):
        """
        putmask the data to the block; we must be a single block and not
        generate other blocks

        return the resulting block

        Parameters
        ----------
        mask : the condition to respect
        new : a ndarray/object
        align : boolean, perform alignment on other/cond, default is True
        inplace : perform inplace modification, default is False

        Returns
        -------
        a new block, the result of the putmask
        """
        inplace = validate_bool_kwarg(inplace, 'inplace')

        # use block's copy logic.
        # .values may be an Index which does shallow copy by default
        new_values = self.values if inplace else self.copy().values
        new_values, _, new, _ = self._try_coerce_args(new_values, new)

        if isinstance(new, np.ndarray) and len(new) == len(mask):
            new = new[mask]
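
        # give the mask the same shape as new_values so the boolean
        # assignment below lines up elementwise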
        mask = _safe_reshape(mask, new_values.shape)

        new_values[mask] = new
        new_values = self._try_coerce_result(new_values)
        return [self.make_block(values=new_values)]

    def _slice(self, slicer):
        """ return a slice of my values (but densify first) """
        return self.get_values()[slicer]

    def _try_cast_result(self, result, dtype=None):
        return result

    def _unstack(self, unstacker_func, new_columns):
        """Return a list of unstacked blocks of self

        Parameters
        ----------
        unstacker_func : callable
            Partially applied unstacker.
        new_columns : Index
            All columns of the unstacked BlockManager.

        Returns
        -------
        blocks : list of Block
            New blocks of unstacked values.
        mask : array_like of bool
            The mask of columns of `blocks` we should keep.
        """
        # NonConsolidatable blocks can have a single item only, so we return
        # one block per item
        unstacker = unstacker_func(self.values.T)
        new_items = unstacker.get_new_columns()
        new_placement = new_columns.get_indexer(new_items)
        new_values, mask = unstacker.get_new_values()
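
        # a column survives if at least one of its entries came from the
        # original data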
        mask = mask.any(0)
        new_values = new_values.T[mask]
        new_placement = new_placement[mask]

        blocks = [self.make_block_same_class(vals, [place])
                  for vals, place in zip(new_values, new_placement)]
        return blocks, mask


class ExtensionBlock(NonConsolidatableMixIn, Block):
    """Block for holding extension types.

    Notes
    -----
    This holds all 3rd-party extension array types. It's also the immediate
    parent class for our internal extension types' blocks, CategoricalBlock.

    ExtensionArrays are limited to 1-D.
    """
    is_extension = True

    def __init__(self, values, placement, ndim=None):
        values = self._maybe_coerce_values(values)
        super(ExtensionBlock, self).__init__(values, placement, ndim)

    def _maybe_coerce_values(self, values):
        """Unbox to an extension array.

        This will unbox an ExtensionArray stored in an Index or Series.
        ExtensionArrays pass through. No dtype coercion is done.

        Parameters
        ----------
        values : Index, Series, ExtensionArray

        Returns
        -------
        ExtensionArray
        """
        if isinstance(values, (ABCIndexClass, ABCSeries)):
            values = values._values
        return values

    @property
    def _holder(self):
        # For extension blocks, the holder is values-dependent.
        return type(self.values)

    @property
    def fill_value(self):
        # Used in reindex_indexer
        return self.values.dtype.na_value

    @property
    def _can_hold_na(self):
        # The default ExtensionArray._can_hold_na is True
        return self._holder._can_hold_na

    @property
    def is_view(self):
        """Extension arrays are never treated as views."""
        return False

    def setitem(self, indexer, value, mgr=None):
        """Set the value inplace, returning a same-typed block.

        This differs from Block.setitem by not allowing setitem to change
        the dtype of the Block.

        Parameters
        ----------
        indexer : tuple, list-like, array-like, slice
            The subset of self.values to set
        value : object
            The value being set
        mgr : BlockPlacement, optional

        Returns
        -------
        Block

        Notes
        -----
        `indexer` is a direct slice/positional indexer. `value` must
        be a compatible shape.
        """
        if isinstance(indexer, tuple):
            # we are always 1-D
            indexer = indexer[0]

        check_setitem_lengths(indexer, value, self.values)
        self.values[indexer] = value
        return self

    def get_values(self, dtype=None):
        # ExtensionArrays must be iterable, so this works.
        values = np.asarray(self.values)
        if values.ndim == self.ndim - 1:
            values = values.reshape((1,) + values.shape)
        return values

    def to_dense(self):
        return np.asarray(self.values)

    def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):
        """
        Take values according to indexer and return them as a block.
        """
        if fill_tuple is None:
            fill_value = None
        else:
            fill_value = fill_tuple[0]

        # axis doesn't matter; we are really a single-dim object
        # but are passed the axis depending on the calling routine
        # if it's REALLY axis 0, then this will be a reindex and not a take
        new_values = self.values.take(indexer, fill_value=fill_value,
                                      allow_fill=True)

        # if we are a 1-dim object, then always place at 0
        if self.ndim == 1:
            new_mgr_locs = [0]
        else:
            if new_mgr_locs is None:
                new_mgr_locs = self.mgr_locs

        return self.make_block_same_class(new_values, new_mgr_locs)

    def _can_hold_element(self, element):
        # XXX: We may need to think about pushing this onto the array.
        # We're doing the same as CategoricalBlock here.
        return True

    def _slice(self, slicer):
        """ return a slice of my values """

        # slice the category
        # return same dims as we currently have

        if isinstance(slicer, tuple) and len(slicer) == 2:
            if not com.is_null_slice(slicer[0]):
                raise AssertionError("invalid slicing for a 1-ndim "
                                     "categorical")
            slicer = slicer[1]

        return self.values[slicer]

    def formatting_values(self):
        return self.values._formatting_values()

    def concat_same_type(self, to_concat, placement=None):
        """
        Concatenate list of single blocks of the same type.
        """
        values = self._holder._concat_same_type(
            [blk.values for blk in to_concat])
        placement = placement or slice(0, len(values), 1)
        return self.make_block_same_class(values, ndim=self.ndim,
                                          placement=placement)

    def fillna(self, value, limit=None, inplace=False, downcast=None,
               mgr=None):
        values = self.values if inplace else self.values.copy()
        values = values.fillna(value=value, limit=limit)
        return [self.make_block_same_class(values=values,
                                           placement=self.mgr_locs,
                                           ndim=self.ndim)]

    def interpolate(self, method='pad', axis=0, inplace=False, limit=None,
                    fill_value=None, **kwargs):

        values = self.values if inplace else self.values.copy()
        return self.make_block_same_class(
            values=values.fillna(value=fill_value, method=method,
                                 limit=limit),
            placement=self.mgr_locs)


class NumericBlock(Block):
    __slots__ = ()
    is_numeric = True
    _can_hold_na = True


class FloatOrComplexBlock(NumericBlock):
    __slots__ = ()

    def equals(self, other):
        if self.dtype != other.dtype or self.shape != other.shape:
            return False
        left, right = self.values, other.values
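        # NaN != NaN elementwise, so OR the comparison with a joint-isnan
        # mask to treat missing values at the same position as equal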
        return ((left == right) | (np.isnan(left) & np.isnan(right))).all()


class FloatBlock(FloatOrComplexBlock):
    __slots__ = ()
    is_float = True

    def _can_hold_element(self, element):
        tipo = maybe_infer_dtype_type(element)
        if tipo is not None:
            return (issubclass(tipo.type, (np.floating, np.integer)) and
                    not issubclass(tipo.type, (np.datetime64, np.timedelta64)))
        return (
            isinstance(
                element, (float, int, np.floating, np.int_, compat.long))
            and not isinstance(element, (bool, np.bool_, datetime, timedelta,
                                         np.datetime64, np.timedelta64)))

    def to_native_types(self, slicer=None, na_rep='', float_format=None,
                        decimal='.', quoting=None, **kwargs):
        """ convert to our native types format, slicing if desired """

        values = self.values
        if slicer is not None:
            values = values[:, slicer]

        # see gh-13418: no special formatting is desired at the
        # output (important for appropriate 'quoting' behaviour),
        # so do not pass it through the FloatArrayFormatter
        if float_format is None and decimal == '.':
            mask = isna(values)

            if not quoting:
                values = values.astype(str)
            else:
                values = np.array(values, dtype='object')

            values[mask] = na_rep
            return values

        from pandas.io.formats.format import FloatArrayFormatter
        formatter = FloatArrayFormatter(values, na_rep=na_rep,
                                        float_format=float_format,
                                        decimal=decimal, quoting=quoting,
                                        fixed_width=False)
        return formatter.get_result_as_array()

    def should_store(self, value):
        # when inserting a column should not coerce integers to floats
        # unnecessarily
        return (issubclass(value.dtype.type, np.floating) and
                value.dtype == self.dtype)


class ComplexBlock(FloatOrComplexBlock):
    __slots__ = ()
    is_complex = True

    def _can_hold_element(self, element):
        tipo = maybe_infer_dtype_type(element)
        if tipo is not None:
            return issubclass(tipo.type,
                              (np.floating, np.integer, np.complexfloating))
        return (
            isinstance(
                element,
                (float, int, complex, np.float_, np.int_, compat.long))
            and not isinstance(element, (bool, np.bool_)))

    def should_store(self, value):
        return issubclass(value.dtype.type, np.complexfloating)


class IntBlock(NumericBlock):
    __slots__ = ()
    is_integer = True
    _can_hold_na = False

    def _can_hold_element(self, element):
        tipo = maybe_infer_dtype_type(element)
        if tipo is not None:
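            # reject datetime-likes and any integer type wider than this
            # block's dtype, so e.g. an int64 value is not squeezed into
            # an int32 block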
            return (issubclass(tipo.type, np.integer) and
                    not issubclass(tipo.type, (np.datetime64,
                                               np.timedelta64)) and
                    self.dtype.itemsize >= tipo.itemsize)
        return is_integer(element)

    def should_store(self, value):
        return is_integer_dtype(value) and value.dtype == self.dtype


class DatetimeLikeBlockMixin(object):
    """Mixin class for DatetimeBlock and DatetimeTZBlock."""

    @property
    def _holder(self):
        return DatetimeIndex

    @property
    def _na_value(self):
        return tslib.NaT

    @property
    def fill_value(self):
        return tslib.iNaT

    def get_values(self, dtype=None):
        """
        return object dtype as boxed values, such as Timestamps/Timedelta
        """
        if is_object_dtype(dtype):
            return lib.map_infer(self.values.ravel(),
                                 self._box_func).reshape(self.values.shape)
        return self.values


class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock):
    __slots__ = ()
    is_timedelta = True
    _can_hold_na = True
    is_numeric = False

    def __init__(self, values, placement, ndim=None):
        if values.dtype != _TD_DTYPE:
            values = conversion.ensure_timedelta64ns(values)

        super(TimeDeltaBlock, self).__init__(values,
                                             placement=placement, ndim=ndim)

    @property
    def _holder(self):
        return TimedeltaIndex

    @property
    def _box_func(self):
        return lambda x: tslib.Timedelta(x, unit='ns')

    def _can_hold_element(self, element):
        tipo = maybe_infer_dtype_type(element)
        if tipo is not None:
            return issubclass(tipo.type, np.timedelta64)
        return is_integer(element) or isinstance(
            element, (timedelta, np.timedelta64))

    def fillna(self, value, **kwargs):

        # allow filling with integers to be
        # interpreted as seconds
        if is_integer(value) and not isinstance(value, np.timedelta64):
            value = Timedelta(value, unit='s')
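            # e.g. value=5 becomes Timedelta(5, unit='s'), i.e. five seconds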
        return super(TimeDeltaBlock, self).fillna(value, **kwargs)

    def _try_coerce_args(self, values, other):
        """
        Coerce values and other to int64, with null values converted to
        iNaT. values is always ndarray-like, other may not be

        Parameters
        ----------
        values : ndarray-like
        other : ndarray-like or scalar

        Returns
        -------
        base-type values, values mask, base-type other, other mask
        """

        values_mask = isna(values)
        values = values.view('i8')
        other_mask = False

        if isinstance(other, bool):
            raise TypeError
        elif is_null_datelike_scalar(other):
            other = tslib.iNaT
            other_mask = True
        elif isinstance(other, Timedelta):
            other_mask = isna(other)
            other = other.value
        elif isinstance(other, timedelta):
            other = Timedelta(other).value
        elif isinstance(other, np.timedelta64):
            other_mask = isna(other)
            other = Timedelta(other).value
        elif hasattr(other, 'dtype') and is_timedelta64_dtype(other):
            other_mask = isna(other)
            other = other.astype('i8', copy=False).view('i8')
        else:
            # coercion issues
            # let higher levels handle
            raise TypeError

        return values, values_mask, other, other_mask

    def _try_coerce_result(self, result):
        """ reverse of try_coerce_args / try_operate """
        if isinstance(result, np.ndarray):
            mask = isna(result)
            if result.dtype.kind in ['i', 'f', 'O']:
                result = result.astype('m8[ns]')
            result[mask] = tslib.iNaT
        elif isinstance(result, (np.integer, np.float)):
            result = self._box_func(result)
        return result

    def should_store(self, value):
        return issubclass(value.dtype.type, np.timedelta64)

    def to_native_types(self, slicer=None, na_rep=None, quoting=None,
                        **kwargs):
        """ convert to our native types format, slicing if desired """

        values = self.values
        if slicer is not None:
            values = values[:, slicer]
        mask = isna(values)

        rvalues = np.empty(values.shape, dtype=object)
        if na_rep is None:
            na_rep = 'NaT'
        rvalues[mask] = na_rep
        imask = (~mask).ravel()

        # FIXME:
        # should use the formats.format.Timedelta64Formatter here
        # to figure what format to pass to the Timedelta
        # e.g. to not show the decimals say
        rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all')
                                        for val in values.ravel()[imask]],
                                       dtype=object)
        return rvalues


class BoolBlock(NumericBlock):
    __slots__ = ()
    is_bool = True
    _can_hold_na = False

    def _can_hold_element(self, element):
        tipo = maybe_infer_dtype_type(element)
        if tipo is not None:
            return issubclass(tipo.type, np.bool_)
        return isinstance(element, (bool, np.bool_))

    def should_store(self, value):
        return issubclass(value.dtype.type, np.bool_)

    def replace(self, to_replace, value, inplace=False, filter=None,
                regex=False, convert=True, mgr=None):
        inplace = validate_bool_kwarg(inplace, 'inplace')
        to_replace_values = np.atleast_1d(to_replace)
        if not np.can_cast(to_replace_values, bool):
            return self
        return super(BoolBlock, self).replace(to_replace, value,
                                              inplace=inplace, filter=filter,
                                              regex=regex, convert=convert,
                                              mgr=mgr)


class ObjectBlock(Block):
    __slots__ = ()
    is_object = True
    _can_hold_na = True

    def __init__(self, values, placement=None, ndim=2):
        if issubclass(values.dtype.type, compat.string_types):
            values = np.array(values, dtype=object)

        super(ObjectBlock, self).__init__(values, ndim=ndim,
                                          placement=placement)

    @property
    def is_bool(self):
        """ we can be a bool if we have only bool values but are of type
        object
        """
        return lib.is_bool_array(self.values.ravel())

    # TODO: Refactor when convert_objects is removed since there will be 1 path
    def convert(self, *args, **kwargs):
        """ attempt to coerce any object types to better types return a copy of
        the block (if copy = True) by definition we ARE an ObjectBlock!!!!!

        can return multiple blocks!
        """

        if args:
            raise NotImplementedError
        by_item = True if 'by_item' not in kwargs else kwargs['by_item']

        new_inputs = ['coerce', 'datetime', 'numeric', 'timedelta']
        new_style = False
        for kw in new_inputs:
            new_style |= kw in kwargs

        if new_style:
            fn = soft_convert_objects
            fn_inputs = new_inputs
        else:
            fn = maybe_convert_objects
            fn_inputs = ['convert_dates', 'convert_numeric',
                         'convert_timedeltas']
        fn_inputs += ['copy']

        fn_kwargs = {}
        for key in fn_inputs:
            if key in kwargs:
                fn_kwargs[key] = kwargs[key]

        # operate column-by-column
        def f(m, v, i):
            shape = v.shape
            values = fn(v.ravel(), **fn_kwargs)
            try:
                values = values.reshape(shape)
                values = _block_shape(values, ndim=self.ndim)
            except (AttributeError, NotImplementedError):
                pass

            return values

        if by_item and not self._is_single_block:
            blocks = self.split_and_operate(None, f, False)
        else:
            values = f(None, self.values.ravel(), None)
            blocks = [make_block(values, ndim=self.ndim,
                                 placement=self.mgr_locs)]

        return blocks

    def set(self, locs, values, check=False):
        """
        Modify Block in-place with new item value

        Returns
        -------
        None
        """

        # GH6026
        if check:
            try:
                if (self.values[locs] == values).all():
                    return
            except:
                pass
        try:
            self.values[locs] = values
        except ValueError:

            # broadcasting error
            # see GH6171
            new_shape = list(values.shape)
            new_shape[0] = len(self.items)
            self.values = np.empty(tuple(new_shape), dtype=self.dtype)
            self.values.fill(np.nan)
            self.values[locs] = values

    def _maybe_downcast(self, blocks, downcast=None):

        if downcast is not None:
            return blocks

        # split and convert the blocks
        return _extend_blocks([b.convert(datetime=True, numeric=False)
                               for b in blocks])

    def _can_hold_element(self, element):
        return True

    def _try_coerce_args(self, values, other):
        """ provide coercion to our input arguments """

        if isinstance(other, ABCDatetimeIndex):
            # to store DatetimeTZBlock as object
            other = other.astype(object).values

        return values, False, other, False

    def should_store(self, value):
        return not (issubclass(value.dtype.type,
                               (np.integer, np.floating, np.complexfloating,
                                np.datetime64, np.bool_)) or
                    # TODO(ExtensionArray): remove is_extension_type
                    # when all extension arrays have been ported.
                    is_extension_type(value) or
                    is_extension_array_dtype(value))

    def replace(self, to_replace, value, inplace=False, filter=None,
                regex=False, convert=True, mgr=None):
        to_rep_is_list = is_list_like(to_replace)
        value_is_list = is_list_like(value)
        both_lists = to_rep_is_list and value_is_list
        either_list = to_rep_is_list or value_is_list

        result_blocks = []
        blocks = [self]

        if not either_list and is_re(to_replace):
            return self._replace_single(to_replace, value, inplace=inplace,
                                        filter=filter, regex=True,
                                        convert=convert, mgr=mgr)
        elif not (either_list or regex):
            return super(ObjectBlock, self).replace(to_replace, value,
                                                    inplace=inplace,
                                                    filter=filter, regex=regex,
                                                    convert=convert, mgr=mgr)
        elif both_lists:
            for to_rep, v in zip(to_replace, value):
                result_blocks = []
                for b in blocks:
                    result = b._replace_single(to_rep, v, inplace=inplace,
                                               filter=filter, regex=regex,
                                               convert=convert, mgr=mgr)
                    result_blocks = _extend_blocks(result, result_blocks)
                blocks = result_blocks
            return result_blocks

        elif to_rep_is_list and regex:
            for to_rep in to_replace:
                result_blocks = []
                for b in blocks:
                    result = b._replace_single(to_rep, value, inplace=inplace,
                                               filter=filter, regex=regex,
                                               convert=convert, mgr=mgr)
                    result_blocks = _extend_blocks(result, result_blocks)
                blocks = result_blocks
            return result_blocks

        return self._replace_single(to_replace, value, inplace=inplace,
                                    filter=filter, convert=convert,
                                    regex=regex, mgr=mgr)

    def _replace_single(self, to_replace, value, inplace=False, filter=None,
                        regex=False, convert=True, mgr=None):

        inplace = validate_bool_kwarg(inplace, 'inplace')

        # to_replace is regex compilable
        to_rep_re = regex and is_re_compilable(to_replace)

        # regex is regex compilable
        regex_re = is_re_compilable(regex)

        # only one will survive
        if to_rep_re and regex_re:
            raise AssertionError('only one of to_replace and regex can be '
                                 'regex compilable')

        # if regex was passed as something that can be a regex (rather than a
        # boolean)
        if regex_re:
            to_replace = regex

        regex = regex_re or to_rep_re

        # try to get the pattern attribute (compiled re) or it's a string
        try:
            pattern = to_replace.pattern
        except AttributeError:
            pattern = to_replace

        # if the pattern is not empty and to_replace is either a string or a
        # regex
        if regex and pattern:
            rx = re.compile(to_replace)
        else:
            # if the thing to replace is not a string or compiled regex call
            # the superclass method -> to_replace is some kind of object
            return super(ObjectBlock, self).replace(to_replace, value,
                                                    inplace=inplace,
                                                    filter=filter, regex=regex,
                                                    mgr=mgr)

        new_values = self.values if inplace else self.values.copy()

        # deal with replacing values with objects (strings) that match but
        # whose replacement is not a string (numeric, nan, object)
        if isna(value) or not isinstance(value, compat.string_types):

            def re_replacer(s):
                try:
                    return value if rx.search(s) is not None else s
                except TypeError:
                    return s
        else:
            # value is guaranteed to be a string here, s can be either a string
            # or null if it's null it gets returned
            def re_replacer(s):
                try:
                    return rx.sub(value, s)
                except TypeError:
                    return s

        f = np.vectorize(re_replacer, otypes=[self.dtype])

        if filter is None:
            filt = slice(None)
        else:
            filt = self.mgr_locs.isin(filter).nonzero()[0]

        new_values[filt] = f(new_values[filt])

        # convert
        block = self.make_block(new_values)
        if convert:
            block = block.convert(by_item=True, numeric=False)

        return block


class CategoricalBlock(ExtensionBlock):
    __slots__ = ()
    is_categorical = True
    _verify_integrity = True
    _can_hold_na = True
    _concatenator = staticmethod(_concat._concat_categorical)

    def __init__(self, values, placement, ndim=None):
        from pandas.core.arrays.categorical import _maybe_to_categorical

        # coerce to categorical if we can
        super(CategoricalBlock, self).__init__(_maybe_to_categorical(values),
                                               placement=placement,
                                               ndim=ndim)

    @property
    def _holder(self):
        return Categorical

    @property
    def array_dtype(self):
        """ the dtype to return if I want to construct this block as an
        array
        """
        return np.object_

    def _try_coerce_result(self, result):
        """ reverse of try_coerce_args """

        # GH12564: CategoricalBlock is 1-dim only
        # while returned results could be any dim
        if ((not is_categorical_dtype(result)) and
                isinstance(result, np.ndarray)):
            result = _block_shape(result, ndim=self.ndim)

        return result

    def shift(self, periods, axis=0, mgr=None):
        return self.make_block_same_class(values=self.values.shift(periods),
                                          placement=self.mgr_locs)

    def to_dense(self):
        # Categorical.get_values returns a DatetimeIndex for datetime
        # categories, so we can't simply use `np.asarray(self.values)` like
        # other types.
        return self.values.get_values()

    def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
        """ convert to our native types format, slicing if desired """

        values = self.values
        if slicer is not None:
            # Categorical is always one dimension
            values = values[slicer]
        mask = isna(values)
        values = np.array(values, dtype='object')
        values[mask] = na_rep

        # we are expected to return a 2-d ndarray
        return values.reshape(1, len(values))

    def concat_same_type(self, to_concat, placement=None):
        """
        Concatenate list of single blocks of the same type.

        Note that this CategoricalBlock._concat_same_type *may* not
        return a CategoricalBlock. When the categories in `to_concat`
        differ, this will return an object ndarray.

        If / when we decide we don't like that behavior:

        1. Change Categorical._concat_same_type to use union_categoricals
        2. Delete this method.
        """
        values = self._concatenator([blk.values for blk in to_concat],
                                    axis=self.ndim - 1)
        # not using self.make_block_same_class as values can be object dtype
        return make_block(
            values, placement=placement or slice(0, len(values), 1),
            ndim=self.ndim)


class DatetimeBlock(DatetimeLikeBlockMixin, Block):
    __slots__ = ()
    is_datetime = True
    _can_hold_na = True

    def __init__(self, values, placement, ndim=None):
        values = self._maybe_coerce_values(values)
        super(DatetimeBlock, self).__init__(values,
                                            placement=placement, ndim=ndim)

    def _maybe_coerce_values(self, values):
        """Input validation for values passed to __init__. Ensure that
        we have datetime64ns, coercing if necessary.

        Parameters
        ----------
        values : array-like
            Must be convertible to datetime64

        Returns
        -------
        values : ndarray[datetime64ns]

        Overridden by DatetimeTZBlock.
        """
        if values.dtype != _NS_DTYPE:
            values = conversion.ensure_datetime64ns(values)
        return values

    def _astype(self, dtype, mgr=None, **kwargs):
        """
        these automatically copy, so copy=True has no effect;
        raises on error if errors == 'raise'
        """

        # if we are passed a datetime64[ns, tz]
        if is_datetime64tz_dtype(dtype):
            dtype = DatetimeTZDtype(dtype)

            values = self.values
            if getattr(values, 'tz', None) is None:
                values = DatetimeIndex(values).tz_localize('UTC')
            values = values.tz_convert(dtype.tz)
            return self.make_block(values)

        # delegate
        return super(DatetimeBlock, self)._astype(dtype=dtype, **kwargs)

    def _can_hold_element(self, element):
        tipo = maybe_infer_dtype_type(element)
        if tipo is not None:
            # TODO: this still uses asarray, instead of dtype.type
            element = np.array(element)
            return element.dtype == _NS_DTYPE or element.dtype == np.int64
        return (is_integer(element) or isinstance(element, datetime) or
                isna(element))

    def _try_coerce_args(self, values, other):
        """
        Coerce values and other to dtype 'i8'. NaN and NaT convert to
        the smallest i8, and will correctly round-trip to NaT if converted
        back in _try_coerce_result. values is always ndarray-like, other
        may not be

        Parameters
        ----------
        values : ndarray-like
        other : ndarray-like or scalar

        Returns
        -------
        base-type values, values mask, base-type other, other mask
        """

        values_mask = isna(values)
        values = values.view('i8')
        other_mask = False

        if isinstance(other, bool):
            raise TypeError
        elif is_null_datelike_scalar(other):
            other = tslib.iNaT
            other_mask = True
        elif isinstance(other, (datetime, np.datetime64, date)):
            other = self._box_func(other)
            if getattr(other, 'tz') is not None:
                raise TypeError("cannot coerce a Timestamp with a tz on a "
                                "naive Block")
            other_mask = isna(other)
            other = other.asm8.view('i8')
        elif hasattr(other, 'dtype') and is_datetime64_dtype(other):
            other_mask = isna(other)
            other = other.astype('i8', copy=False).view('i8')
        else:
            # coercion issues
            # let higher levels handle
            raise TypeError

        return values, values_mask, other, other_mask

    def _try_coerce_result(self, result):
        """ reverse of try_coerce_args """
        if isinstance(result, np.ndarray):
            if result.dtype.kind in ['i', 'f', 'O']:
                try:
                    result = result.astype('M8[ns]')
                except ValueError:
                    pass
        elif isinstance(result, (np.integer, np.float, np.datetime64)):
            result = self._box_func(result)
        return result

    @property
    def _box_func(self):
        return tslib.Timestamp

    def to_native_types(self, slicer=None, na_rep=None, date_format=None,
                        quoting=None, **kwargs):
        """ convert to our native types format, slicing if desired """

        values = self.values
        if slicer is not None:
            values = values[..., slicer]

        from pandas.io.formats.format import _get_format_datetime64_from_values
        format = _get_format_datetime64_from_values(values, date_format)

        result = tslib.format_array_from_datetime(
            values.view('i8').ravel(), tz=getattr(self.values, 'tz', None),
            format=format, na_rep=na_rep).reshape(values.shape)
        return np.atleast_2d(result)

    def should_store(self, value):
        return (issubclass(value.dtype.type, np.datetime64) and
                not is_datetimetz(value))

    def set(self, locs, values, check=False):
        """
        Modify Block in-place with new item value

        Returns
        -------
        None
        """
        if values.dtype != _NS_DTYPE:
            # Workaround for numpy 1.6 bug
            values = conversion.ensure_datetime64ns(values)

        self.values[locs] = values


class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock):
    """ implement a datetime64 block with a tz attribute """
    __slots__ = ()
    _concatenator = staticmethod(_concat._concat_datetime)
    is_datetimetz = True

    def __init__(self, values, placement, ndim=2, dtype=None):
        # XXX: This will end up calling _maybe_coerce_values twice
        # when dtype is not None. It's relatively cheap (just an isinstance)
        # but it'd be nice to avoid.
        #
        # If we can remove dtype from __init__, and push that conversion
        # onto the callers, then we can remove this entire __init__
        # and just use DatetimeBlock's.
        if dtype is not None:
            values = self._maybe_coerce_values(values, dtype=dtype)
        super(DatetimeTZBlock, self).__init__(values, placement=placement,
                                              ndim=ndim)

    def _maybe_coerce_values(self, values, dtype=None):
        """Input validation for values passed to __init__. Ensure that
        we have datetime64TZ, coercing if necessary.

        Parameters
        ----------
        values : array-like
            Must be convertible to datetime64
        dtype : string or DatetimeTZDtype, optional
            Does a shallow copy to this tz

        Returns
        -------
        values : ndarray[datetime64ns]
        """
        if not isinstance(values, self._holder):
            values = self._holder(values)

        if dtype is not None:
            if isinstance(dtype, compat.string_types):
                dtype = DatetimeTZDtype.construct_from_string(dtype)
            values = values._shallow_copy(tz=dtype.tz)

        if values.tz is None:
            raise ValueError("cannot create a DatetimeTZBlock without a tz")

        return values

    @property
    def is_view(self):
        """ return a boolean if I am possibly a view """
        # check the ndarray values of the DatetimeIndex values
        return self.values.values.base is not None

    def copy(self, deep=True, mgr=None):
        """ copy constructor """
        values = self.values
        if deep:
            values = values.copy(deep=True)
        return self.make_block_same_class(values)

    def external_values(self):
        """ we internally represent the data as a DatetimeIndex, but for
        external compat with ndarray, export as a ndarray of Timestamps
        """
        return self.values.astype('datetime64[ns]').values

    def get_values(self, dtype=None):
        # return object dtype as Timestamps with the zones
        if is_object_dtype(dtype):
            return lib.map_infer(
                self.values.ravel(), self._box_func).reshape(self.values.shape)
        return self.values

    def _slice(self, slicer):
        """ return a slice of my values """
        if isinstance(slicer, tuple):
            col, loc = slicer
            if not com.is_null_slice(col) and col != 0:
                raise IndexError("{0} only contains one item".format(self))
            return self.values[loc]
        return self.values[slicer]

    def _try_coerce_args(self, values, other):
        """
        localize and return i8 for the values

        Parameters
        ----------
        values : ndarray-like
        other : ndarray-like or scalar

        Returns
        -------
        base-type values, values mask, base-type other, other mask
        """
        values_mask = _block_shape(isna(values), ndim=self.ndim)
        # asi8 is a view, needs copy
        values = _block_shape(values.asi8, ndim=self.ndim)
        other_mask = False

        if isinstance(other, ABCSeries):
            other = self._holder(other)
            other_mask = isna(other)

        if isinstance(other, bool):
            raise TypeError
        elif (is_null_datelike_scalar(other) or
              (is_scalar(other) and isna(other))):
            other = tslib.iNaT
            other_mask = True
        elif isinstance(other, self._holder):
            if other.tz != self.values.tz:
                raise ValueError("incompatible or non tz-aware value")
            other = other.asi8
            other_mask = isna(other)
        elif isinstance(other, (np.datetime64, datetime, date)):
            other = tslib.Timestamp(other)
            tz = getattr(other, 'tz', None)

            # test we can have an equal time zone
            if tz is None or str(tz) != str(self.values.tz):
                raise ValueError("incompatible or non tz-aware value")
            other_mask = isna(other)
            other = other.value
        else:
            raise TypeError

        return values, values_mask, other, other_mask

    def _try_coerce_result(self, result):
        """ reverse of try_coerce_args """
        if isinstance(result, np.ndarray):
            if result.dtype.kind in ['i', 'f', 'O']:
                result = result.astype('M8[ns]')
        elif isinstance(result, (np.integer, np.float, np.datetime64)):
            result = tslib.Timestamp(result, tz=self.values.tz)
        if isinstance(result, np.ndarray):
            # allow passing of > 1dim if it's trivial
            if result.ndim > 1:
                result = result.reshape(np.prod(result.shape))
            result = self.values._shallow_copy(result)

        return result

    @property
    def _box_func(self):
        return lambda x: tslib.Timestamp(x, tz=self.dtype.tz)

    def shift(self, periods, axis=0, mgr=None):
        """ shift the block by periods """

        # think about moving this to the DatetimeIndex. This is a non-freq
        # (number of periods) shift ###

        N = len(self)
        indexer = np.zeros(N, dtype=int)
        if periods > 0:
            indexer[periods:] = np.arange(N - periods)
        else:
            indexer[:periods] = np.arange(-periods, N)
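
        # the slots vacated by the shift still point at position 0 here;
        # they are overwritten with iNaT after the take below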
        new_values = self.values.asi8.take(indexer)

        if periods > 0:
            new_values[:periods] = tslib.iNaT
        else:
            new_values[periods:] = tslib.iNaT

        new_values = self.values._shallow_copy(new_values)
        return [self.make_block_same_class(new_values,
                                           placement=self.mgr_locs)]

    def diff(self, n, axis=0, mgr=None):
        """1st discrete difference

        Parameters
        ----------
        n : int, number of periods to diff
        axis : int, axis to diff upon. default 0
        mgr : default None

        Return
        ------
        A list with a new TimeDeltaBlock.

        Note
        ----
        The arguments here are mimicking shift so they are called correctly
        by apply.
        """
        if axis == 0:
            # Cannot currently calculate diff across multiple blocks since this
            # function is invoked via apply
            raise NotImplementedError
        new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8

        # Reshape the new_values like how algos.diff does for timedelta data
        new_values = new_values.reshape(1, len(new_values))
        new_values = new_values.astype('timedelta64[ns]')
        return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)]

    def concat_same_type(self, to_concat, placement=None):
        """
        Concatenate list of single blocks of the same type.
        """
        values = self._concatenator([blk.values for blk in to_concat],
                                    axis=self.ndim - 1)
        # not using self.make_block_same_class as values can be non-tz dtype
        return make_block(
            values, placement=placement or slice(0, len(values), 1))


class SparseBlock(NonConsolidatableMixIn, Block):
    """ implement as a list of sparse arrays of the same dtype """
    __slots__ = ()
    is_sparse = True
    is_numeric = True
    _box_to_block_values = False
    _can_hold_na = True
    _ftype = 'sparse'
    _concatenator = staticmethod(_concat._concat_sparse)

    def __init__(self, values, placement, ndim=None):
        # Ensure that we have the underlying SparseArray here...
        if isinstance(values, ABCSeries):
            values = values.values
        assert isinstance(values, SparseArray)
        super(SparseBlock, self).__init__(values, placement, ndim=ndim)

    @property
    def _holder(self):
        return SparseArray

    @property
    def shape(self):
        return (len(self.mgr_locs), self.sp_index.length)

    @property
    def fill_value(self):
        # return np.nan
        return self.values.fill_value

    @fill_value.setter
    def fill_value(self, v):
        self.values.fill_value = v

    def to_dense(self):
        return self.values.to_dense().view()

    @property
    def sp_values(self):
        return self.values.sp_values

    @sp_values.setter
    def sp_values(self, v):
        # reset the sparse values
        self.values = SparseArray(v, sparse_index=self.sp_index,
                                  kind=self.kind, dtype=v.dtype,
                                  fill_value=self.values.fill_value,
                                  copy=False)

    @property
    def sp_index(self):
        return self.values.sp_index

    @property
    def kind(self):
        return self.values.kind

    def _astype(self, dtype, copy=False, errors='raise', values=None,
                klass=None, mgr=None, **kwargs):
        if values is None:
            values = self.values
        values = values.astype(dtype, copy=copy)
        return self.make_block_same_class(values=values,
                                          placement=self.mgr_locs)

    def __len__(self):
        try:
            return self.sp_index.length
        except:
            return 0

    def copy(self, deep=True, mgr=None):
        return self.make_block_same_class(values=self.values,
                                          sparse_index=self.sp_index,
                                          kind=self.kind, copy=deep,
                                          placement=self.mgr_locs)

    def make_block_same_class(self, values, placement, sparse_index=None,
                              kind=None, dtype=None, fill_value=None,
                              copy=False, ndim=None):
        """ return a new block """
        if dtype is None:
            dtype = values.dtype
        if fill_value is None and not isinstance(values, SparseArray):
            fill_value = self.values.fill_value

        # if not isinstance(values, SparseArray) and values.ndim != self.ndim:
        #     raise ValueError("ndim mismatch")

        if values.ndim == 2:
            nitems = values.shape[0]

            if nitems == 0:
                # kludgy, but SparseBlocks cannot handle slices, where the
                # output is 0-item, so let's convert it to a dense block: it
                # won't take space since there's 0 items, plus it will preserve
                # the dtype.
                return self.make_block(np.empty(values.shape, dtype=dtype),
                                       placement)
            elif nitems > 1:
                raise ValueError("Only 1-item 2d sparse blocks are supported")
            else:
                values = values.reshape(values.shape[1])

        new_values = SparseArray(values, sparse_index=sparse_index,
                                 kind=kind or self.kind, dtype=dtype,
                                 fill_value=fill_value, copy=copy)
        return self.make_block(new_values,
                               placement=placement)

    def interpolate(self, method='pad', axis=0, inplace=False, limit=None,
                    fill_value=None, **kwargs):

        values = missing.interpolate_2d(self.values.to_dense(), method, axis,
                                        limit, fill_value)
        return self.make_block_same_class(values=values,
                                          placement=self.mgr_locs)

    def fillna(self, value, limit=None, inplace=False, downcast=None,
               mgr=None):
        # we may need to upcast our fill to match our dtype
        if limit is not None:
            raise NotImplementedError("specifying a limit for 'fillna' has "
                                      "not been implemented yet")
        values = self.values if inplace else self.values.copy()
        values = values.fillna(value, downcast=downcast)
        return [self.make_block_same_class(values=values,
                                           placement=self.mgr_locs)]

    def shift(self, periods, axis=0, mgr=None):
        """ shift the block by periods """
        N = len(self.values.T)
        indexer = np.zeros(N, dtype=int)
        if periods > 0:
            indexer[periods:] = np.arange(N - periods)
        else:
            indexer[:periods] = np.arange(-periods, N)
        new_values = self.values.to_dense().take(indexer)
        # convert integer to float if necessary. need to do a lot more than
        # that, handle boolean etc also
        new_values, fill_value = maybe_upcast(new_values)
        if periods > 0:
            new_values[:periods] = fill_value
        else:
            new_values[periods:] = fill_value
        return [self.make_block_same_class(new_values,
                                           placement=self.mgr_locs)]

    def sparse_reindex(self, new_index):
        """ sparse reindex and return a new block
        current reindex only works for float64 dtype! """
        values = self.values
        values = values.sp_index.to_int_index().reindex(
            values.sp_values.astype('float64'), values.fill_value, new_index)
        return self.make_block_same_class(values, sparse_index=new_index,
                                          placement=self.mgr_locs)


def get_block_type(values, dtype=None):
    """
    Find the appropriate Block subclass to use for the given values and dtype.

    Parameters
    ----------
    values : ndarray-like
    dtype : numpy or pandas dtype

    Returns
    -------
    cls : class, subclass of Block
    """
    dtype = dtype or values.dtype
    vtype = dtype.type

    if is_sparse(values):
        cls = SparseBlock
    elif issubclass(vtype, np.floating):
        cls = FloatBlock
    elif issubclass(vtype, np.timedelta64):
        assert issubclass(vtype, np.integer)
        cls = TimeDeltaBlock
    elif issubclass(vtype, np.complexfloating):
        cls = ComplexBlock
    elif issubclass(vtype, np.datetime64):
        assert not is_datetimetz(values)
        cls = DatetimeBlock
    elif is_datetimetz(values):
        cls = DatetimeTZBlock
    elif issubclass(vtype, np.integer):
        cls = IntBlock
    elif dtype == np.bool_:
        cls = BoolBlock
    elif is_categorical(values):
        cls = CategoricalBlock
    elif is_extension_array_dtype(values):
        cls = ExtensionBlock
    else:
        cls = ObjectBlock
    return cls
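
# e.g. an int64 ndarray resolves to IntBlock, a tz-aware DatetimeIndex to
# DatetimeTZBlock, and anything unrecognised falls back to ObjectBlock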


def make_block(values, placement, klass=None, ndim=None, dtype=None,
               fastpath=None):
    if fastpath is not None:
        # GH#19265 pyarrow is passing this
        warnings.warn("fastpath argument is deprecated, will be removed "
                      "in a future release.", DeprecationWarning)
    if klass is None:
        dtype = dtype or values.dtype
        klass = get_block_type(values, dtype)

    elif klass is DatetimeTZBlock and not is_datetimetz(values):
        return klass(values, ndim=ndim,
                     placement=placement, dtype=dtype)

    return klass(values, ndim=ndim, placement=placement)

# TODO: flexible with index=None and/or items=None


class BlockManager(PandasObject):
    """
    Core internal data structure to implement DataFrame, Series, Panel, etc.

    Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
    lightweight blocked set of labeled data to be manipulated by the DataFrame
    public API class

    Attributes
    ----------
    shape
    ndim
    axes
    values
    items

    Methods
    -------
    set_axis(axis, new_labels)
    copy(deep=True)

    get_dtype_counts
    get_ftype_counts
    get_dtypes
    get_ftypes

    apply(func, axes, block_filter_fn)

    get_bool_data
    get_numeric_data

    get_slice(slice_like, axis)
    get(label)
    iget(loc)
    get_scalar(label_tup)

    take(indexer, axis)
    reindex_axis(new_labels, axis)
    reindex_indexer(new_labels, indexer, axis)

    delete(label)
    insert(loc, label, value)
    set(label, value)

    Parameters
    ----------


    Notes
    -----
    This is *not* a public API class
    """
    __slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated',
                 '_is_consolidated', '_blknos', '_blklocs']

    def __init__(self, blocks, axes, do_integrity_check=True):
        self.axes = [_ensure_index(ax) for ax in axes]
        self.blocks = tuple(blocks)

        for block in blocks:
            if block.is_sparse:
                if len(block.mgr_locs) != 1:
                    raise AssertionError("Sparse block refers to multiple "
                                         "items")
            else:
                if self.ndim != block.ndim:
                    raise AssertionError(
                        'Number of Block dimensions ({block}) must equal '
                        'number of axes ({self})'.format(block=block.ndim,
                                                         self=self.ndim))

        if do_integrity_check:
            self._verify_integrity()

        self._consolidate_check()

        self._rebuild_blknos_and_blklocs()

    def make_empty(self, axes=None):
        """ return an empty BlockManager with the items axis of len 0 """
        if axes is None:
            axes = [_ensure_index([])] + [_ensure_index(a)
                                          for a in self.axes[1:]]

        # preserve dtype if possible
        if self.ndim == 1:
            blocks = np.array([], dtype=self.array_dtype)
        else:
            blocks = []
        return self.__class__(blocks, axes)

    def __nonzero__(self):
        return True

    # Python3 compat
    __bool__ = __nonzero__

    @property
    def shape(self):
        return tuple(len(ax) for ax in self.axes)

    @property
    def ndim(self):
        return len(self.axes)

    def set_axis(self, axis, new_labels):
        new_labels = _ensure_index(new_labels)
        old_len = len(self.axes[axis])
        new_len = len(new_labels)

        if new_len != old_len:
            raise ValueError(
                'Length mismatch: Expected axis has {old} elements, new '
                'values have {new} elements'.format(old=old_len, new=new_len))

        self.axes[axis] = new_labels

    def rename_axis(self, mapper, axis, copy=True, level=None):
        """
        Rename one of axes.

        Parameters
        ----------
        mapper : unary callable
        axis : int
        copy : boolean, default True
        level : int, default None

        """
        obj = self.copy(deep=copy)
        obj.set_axis(axis, _transform_index(self.axes[axis], mapper, level))
        return obj

    def add_prefix(self, prefix):
        f = partial('{prefix}{}'.format, prefix=prefix)
        return self.rename_axis(f, axis=0)

    def add_suffix(self, suffix):
        f = partial('{}{suffix}'.format, suffix=suffix)
        return self.rename_axis(f, axis=0)

    @property
    def _is_single_block(self):
        if self.ndim == 1:
            return True

        if len(self.blocks) != 1:
            return False

        blk = self.blocks[0]
        return (blk.mgr_locs.is_slice_like and
                blk.mgr_locs.as_slice == slice(0, len(self), 1))

    def _rebuild_blknos_and_blklocs(self):
        """
        Update mgr._blknos / mgr._blklocs.
        """
        new_blknos = np.empty(self.shape[0], dtype=np.int64)
        new_blklocs = np.empty(self.shape[0], dtype=np.int64)
        new_blknos.fill(-1)
        new_blklocs.fill(-1)

        for blkno, blk in enumerate(self.blocks):
            rl = blk.mgr_locs
            new_blknos[rl.indexer] = blkno
            new_blklocs[rl.indexer] = np.arange(len(rl))
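
        # at this point _blknos[i] names the block holding item i and
        # _blklocs[i] gives that item's position inside the block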
        if (new_blknos == -1).any():
            raise AssertionError("Gaps in blk ref_locs")

        self._blknos = new_blknos
        self._blklocs = new_blklocs

    # make items read only for now
    def _get_items(self):
        return self.axes[0]

    items = property(fget=_get_items)

    def _get_counts(self, f):
        """ return a dict of the counts of the function in BlockManager """
        self._consolidate_inplace()
        counts = dict()
        for b in self.blocks:
            v = f(b)
            counts[v] = counts.get(v, 0) + b.shape[0]
        return counts

    def get_dtype_counts(self):
        return self._get_counts(lambda b: b.dtype.name)

    def get_ftype_counts(self):
        return self._get_counts(lambda b: b.ftype)

    def get_dtypes(self):
        dtypes = np.array([blk.dtype for blk in self.blocks])
        return algos.take_1d(dtypes, self._blknos, allow_fill=False)

    def get_ftypes(self):
        ftypes = np.array([blk.ftype for blk in self.blocks])
        return algos.take_1d(ftypes, self._blknos, allow_fill=False)

    def __getstate__(self):
        block_values = [b.values for b in self.blocks]
        block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
        axes_array = [ax for ax in self.axes]

        extra_state = {
            '0.14.1': {
                'axes': axes_array,
                'blocks': [dict(values=b.values, mgr_locs=b.mgr_locs.indexer)
                           for b in self.blocks]
            }
        }

        # First three elements of the state are to maintain forward
        # compatibility with 0.13.1.
        return axes_array, block_values, block_items, extra_state

    def __setstate__(self, state):
        def unpickle_block(values, mgr_locs):
            # numpy < 1.7 pickle compat
            if values.dtype == 'M8[us]':
                values = values.astype('M8[ns]')
            return make_block(values, placement=mgr_locs)

        if (isinstance(state, tuple) and len(state) >= 4 and
                '0.14.1' in state[3]):
            state = state[3]['0.14.1']
            self.axes = [_ensure_index(ax) for ax in state['axes']]
            self.blocks = tuple(unpickle_block(b['values'], b['mgr_locs'])
                                for b in state['blocks'])
        else:
            # discard anything after 3rd, support beta pickling format for a
            # little while longer
            ax_arrays, bvalues, bitems = state[:3]

            self.axes = [_ensure_index(ax) for ax in ax_arrays]

            if len(bitems) == 1 and self.axes[0].equals(bitems[0]):
                # This is a workaround for pre-0.14.1 pickles that didn't
                # support unpickling multi-block frames/panels with non-unique
                # columns/items, because given a manager with items ["a", "b",
                # "a"] there's no way of knowing which block's "a" is where.
                #
                # Single-block case can be supported under the assumption that
                # block items corresponded to manager items 1-to-1.
                all_mgr_locs = [slice(0, len(bitems[0]))]
            else:
                all_mgr_locs = [self.axes[0].get_indexer(blk_items)
                                for blk_items in bitems]

            self.blocks = tuple(
                unpickle_block(values, mgr_locs)
                for values, mgr_locs in zip(bvalues, all_mgr_locs))

        self._post_setstate()

    def _post_setstate(self):
        self._is_consolidated = False
        self._known_consolidated = False
        self._rebuild_blknos_and_blklocs()

    def __len__(self):
        return len(self.items)

    def __unicode__(self):
        output = pprint_thing(self.__class__.__name__)
        for i, ax in enumerate(self.axes):
            if i == 0:
                output += u('\nItems: {ax}'.format(ax=ax))
            else:
                output += u('\nAxis {i}: {ax}'.format(i=i, ax=ax))

        for block in self.blocks:
            output += u('\n{block}'.format(block=pprint_thing(block)))
        return output

    def _verify_integrity(self):
        mgr_shape = self.shape
        tot_items = sum(len(x.mgr_locs) for x in self.blocks)
        for block in self.blocks:
            if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
                construction_error(tot_items, block.shape[1:], self.axes)
        if len(self.items) != tot_items:
            raise AssertionError('Number of manager items must equal union of '
                                 'block items\n# manager items: {0}, # '
                                 'tot_items: {1}'.format(
                                     len(self.items), tot_items))

    def apply(self, f, axes=None, filter=None, do_integrity_check=False,
              consolidate=True, **kwargs):
        """
        iterate over the blocks, collect and create a new block manager

        Parameters
        ----------
        f : the callable or function name to operate on at the block level
        axes : optional (if not supplied, use self.axes)
        filter : list, if supplied, only call the block if the filter is in
            the block
        do_integrity_check : boolean, default False. Do the block manager
            integrity check
        consolidate: boolean, default True. Join together blocks having same
            dtype

        Returns
        -------
        Block Manager (new object)

        """

        result_blocks = []

        # filter kwarg is used in replace-* family of methods
        if filter is not None:
            filter_locs = set(self.items.get_indexer_for(filter))
            if len(filter_locs) == len(self.items):
                # All items are included, as if there were no filtering
                filter = None
            else:
                kwargs['filter'] = filter_locs

        if consolidate:
            self._consolidate_inplace()
|
||
|
|
||
|
if f == 'where':
|
||
|
align_copy = True
|
||
|
if kwargs.get('align', True):
|
||
|
align_keys = ['other', 'cond']
|
||
|
else:
|
||
|
align_keys = ['cond']
|
||
|
elif f == 'putmask':
|
||
|
align_copy = False
|
||
|
if kwargs.get('align', True):
|
||
|
align_keys = ['new', 'mask']
|
||
|
else:
|
||
|
align_keys = ['mask']
|
||
|
elif f == 'eval':
|
||
|
align_copy = False
|
||
|
align_keys = ['other']
|
||
|
elif f == 'fillna':
|
||
|
# fillna internally does putmask, maybe it's better to do this
|
||
|
# at mgr, not block level?
|
||
|
align_copy = False
|
||
|
align_keys = ['value']
|
||
|
else:
|
||
|
align_keys = []
|
||
|
|
||
|
# TODO(EA): may interfere with ExtensionBlock.setitem for blocks
|
||
|
# with a .values attribute.
|
||
|
aligned_args = dict((k, kwargs[k])
|
||
|
for k in align_keys
|
||
|
if hasattr(kwargs[k], 'values') and
|
||
|
not isinstance(kwargs[k], ABCExtensionArray))
|
||
|
|
||
|
for b in self.blocks:
|
||
|
if filter is not None:
|
||
|
if not b.mgr_locs.isin(filter_locs).any():
|
||
|
result_blocks.append(b)
|
||
|
continue
|
||
|
|
||
|
if aligned_args:
|
||
|
b_items = self.items[b.mgr_locs.indexer]
|
||
|
|
||
|
for k, obj in aligned_args.items():
|
||
|
axis = getattr(obj, '_info_axis_number', 0)
|
||
|
kwargs[k] = obj.reindex(b_items, axis=axis,
|
||
|
copy=align_copy)
|
||
|
|
||
|
kwargs['mgr'] = self
|
||
|
applied = getattr(b, f)(**kwargs)
|
||
|
result_blocks = _extend_blocks(applied, result_blocks)
|
||
|
|
||
|
if len(result_blocks) == 0:
|
||
|
return self.make_empty(axes or self.axes)
|
||
|
bm = self.__class__(result_blocks, axes or self.axes,
|
||
|
do_integrity_check=do_integrity_check)
|
||
|
bm._consolidate_inplace()
|
||
|
return bm
|
||
|
|
||
|
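    # A minimal sketch of the dispatch above, assuming a simple frame
    # (hypothetical REPL session; `df._data` is the frame's BlockManager):
    #
    # >>> df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
    # >>> mgr = df._data
    # >>> new_mgr = mgr.apply('copy')      # calls Block.copy on each block
    # >>> len(new_mgr.blocks) == len(mgr.blocks)
    # True
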
    def reduction(self, f, axis=0, consolidate=True, transposed=False,
                  **kwargs):
        """
        iterate over the blocks, collect and create a new block manager.
        This routine is intended for reduction type operations and
        will do inference on the generated blocks.

        Parameters
        ----------
        f: the callable or function name to operate on at the block level
        axis: reduction axis, default 0
        consolidate: boolean, default True. Join together blocks having same
            dtype
        transposed: boolean, default False
            we are holding transposed data

        Returns
        -------
        Block Manager (new object)

        """

        if consolidate:
            self._consolidate_inplace()

        axes, blocks = [], []
        for b in self.blocks:
            kwargs['mgr'] = self
            axe, block = getattr(b, f)(axis=axis, **kwargs)

            axes.append(axe)
            blocks.append(block)

        # note that some DatetimeTZ, Categorical are always ndim==1
        ndim = {b.ndim for b in blocks}

        if 2 in ndim:

            new_axes = list(self.axes)

            # multiple blocks that are reduced
            if len(blocks) > 1:
                new_axes[1] = axes[0]

                # reset the placement to the original
                for b, sb in zip(blocks, self.blocks):
                    b.mgr_locs = sb.mgr_locs

            else:
                new_axes[axis] = Index(np.concatenate(
                    [ax.values for ax in axes]))

            if transposed:
                new_axes = new_axes[::-1]
                blocks = [b.make_block(b.values.T,
                                       placement=np.arange(b.shape[1])
                                       ) for b in blocks]

            return self.__class__(blocks, new_axes)

        # 0 ndim
        if 0 in ndim and 1 not in ndim:
            values = np.array([b.values for b in blocks])
            if len(values) == 1:
                return values.item()
            blocks = [make_block(values, ndim=1)]
            axes = Index([ax[0] for ax in axes])

        # single block
        values = _concat._concat_compat([b.values for b in blocks])

        # compute the orderings of our original data
        if len(self.blocks) > 1:

            indexer = np.empty(len(self.axes[0]), dtype=np.intp)
            i = 0
            for b in self.blocks:
                for j in b.mgr_locs:
                    indexer[j] = i
                    i = i + 1

            values = values.take(indexer)

        return SingleBlockManager(
            [make_block(values,
                        ndim=1,
                        placement=np.arange(len(values)))],
            axes[0])

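    # A minimal sketch: quantile below routes through reduction('quantile'),
    # each block reducing to a ndim==1 result stitched back into item order
    # (hypothetical REPL session):
    #
    # >>> df = pd.DataFrame({'a': [1., 2.], 'b': [3., 4.]})
    # >>> df.quantile(0.5)
    # a    1.5
    # b    3.5
    # Name: 0.5, dtype: float64
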
    def isna(self, func, **kwargs):
        return self.apply('apply', func=func, **kwargs)

    def where(self, **kwargs):
        return self.apply('where', **kwargs)

    def eval(self, **kwargs):
        return self.apply('eval', **kwargs)

    def quantile(self, **kwargs):
        return self.reduction('quantile', **kwargs)

    def setitem(self, **kwargs):
        return self.apply('setitem', **kwargs)

    def putmask(self, **kwargs):
        return self.apply('putmask', **kwargs)

    def diff(self, **kwargs):
        return self.apply('diff', **kwargs)

    def interpolate(self, **kwargs):
        return self.apply('interpolate', **kwargs)

    def shift(self, **kwargs):
        return self.apply('shift', **kwargs)

    def fillna(self, **kwargs):
        return self.apply('fillna', **kwargs)

    def downcast(self, **kwargs):
        return self.apply('downcast', **kwargs)

    def astype(self, dtype, **kwargs):
        return self.apply('astype', dtype=dtype, **kwargs)

    def convert(self, **kwargs):
        return self.apply('convert', **kwargs)

    def replace(self, **kwargs):
        return self.apply('replace', **kwargs)

    def replace_list(self, src_list, dest_list, inplace=False, regex=False,
                     mgr=None):
        """ do a list replace """

        inplace = validate_bool_kwarg(inplace, 'inplace')

        if mgr is None:
            mgr = self

        # figure out our mask a-priori to avoid repeated replacements
        values = self.as_array()

        def comp(s):
            if isna(s):
                return isna(values)
            return _maybe_compare(values, getattr(s, 'asm8', s), operator.eq)

        masks = [comp(s) for i, s in enumerate(src_list)]

        result_blocks = []
        src_len = len(src_list) - 1
        for blk in self.blocks:

            # it's possible to get multiple result blocks here;
            # replace ALWAYS will return a list
            rb = [blk if inplace else blk.copy()]
            for i, (s, d) in enumerate(zip(src_list, dest_list)):
                new_rb = []
                for b in rb:
                    if b.dtype == np.object_:
                        convert = i == src_len
                        result = b.replace(s, d, inplace=inplace, regex=regex,
                                           mgr=mgr, convert=convert)
                        new_rb = _extend_blocks(result, new_rb)
                    else:
                        # get our mask for this element, sized to this
                        # particular block
                        m = masks[i][b.mgr_locs.indexer]
                        if m.any():
                            b = b.coerce_to_target_dtype(d)
                            new_rb.extend(b.putmask(m, d, inplace=True))
                        else:
                            new_rb.append(b)
                rb = new_rb
            result_blocks.extend(rb)

        bm = self.__class__(result_blocks, self.axes)
        bm._consolidate_inplace()
        return bm

    def reshape_nd(self, axes, **kwargs):
        """ a 2d-nd reshape operation on a BlockManager """
        return self.apply('reshape_nd', axes=axes, **kwargs)

    def is_consolidated(self):
        """
        Return True if the blocks are consolidated, i.e. no two blocks
        share the same ftype
        """
        if not self._known_consolidated:
            self._consolidate_check()
        return self._is_consolidated

    def _consolidate_check(self):
        ftypes = [blk.ftype for blk in self.blocks]
        self._is_consolidated = len(ftypes) == len(set(ftypes))
        self._known_consolidated = True

    @property
    def is_mixed_type(self):
        # Warning, consolidation needs to get checked upstairs
        self._consolidate_inplace()
        return len(self.blocks) > 1

    @property
    def is_numeric_mixed_type(self):
        # Warning, consolidation needs to get checked upstairs
        self._consolidate_inplace()
        return all(block.is_numeric for block in self.blocks)

    @property
    def is_datelike_mixed_type(self):
        # Warning, consolidation needs to get checked upstairs
        self._consolidate_inplace()
        return any(block.is_datelike for block in self.blocks)

    @property
    def any_extension_types(self):
        """Whether any of the blocks in this manager are extension blocks"""
        return any(block.is_extension for block in self.blocks)

    @property
    def is_view(self):
        """ return a boolean if we are a single block and are a view """
        if len(self.blocks) == 1:
            return self.blocks[0].is_view

        # It is technically possible to figure out which blocks are views
        # e.g. [ b.values.base is not None for b in self.blocks ]
        # but then we have the case of possibly some blocks being a view
        # and some blocks not. setting in theory is possible on the non-view
        # blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
        # complicated

        return False

    def get_bool_data(self, copy=False):
        """
        Parameters
        ----------
        copy : boolean, default False
            Whether to copy the blocks
        """
        self._consolidate_inplace()
        return self.combine([b for b in self.blocks if b.is_bool], copy)

    def get_numeric_data(self, copy=False):
        """
        Parameters
        ----------
        copy : boolean, default False
            Whether to copy the blocks
        """
        self._consolidate_inplace()
        return self.combine([b for b in self.blocks if b.is_numeric], copy)

    def combine(self, blocks, copy=True):
        """ return a new manager with the blocks """
        if len(blocks) == 0:
            return self.make_empty()

        # FIXME: optimization potential
        indexer = np.sort(np.concatenate([b.mgr_locs.as_array
                                          for b in blocks]))
        inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])

        new_blocks = []
        for b in blocks:
            b = b.copy(deep=copy)
            b.mgr_locs = algos.take_1d(inv_indexer, b.mgr_locs.as_array,
                                       axis=0, allow_fill=False)
            new_blocks.append(b)

        axes = list(self.axes)
        axes[0] = self.items.take(indexer)

        return self.__class__(new_blocks, axes, do_integrity_check=False)

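    # A minimal sketch of the inverse-indexer remapping above (hypothetical):
    # if the kept blocks occupy manager locations [3] and [1], the sorted
    # indexer is [1, 3] and the reverse indexer compacts old locations onto
    # the new axis:
    #
    # >>> indexer = np.sort(np.concatenate([[3], [1]]))
    # >>> indexer
    # array([1, 3])
    # >>> lib.get_reverse_indexer(indexer, 5)
    # array([-1,  0, -1,  1, -1])
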
    def get_slice(self, slobj, axis=0):
        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        if axis == 0:
            new_blocks = self._slice_take_blocks_ax0(slobj)
        else:
            slicer = [slice(None)] * (axis + 1)
            slicer[axis] = slobj
            slicer = tuple(slicer)
            new_blocks = [blk.getitem_block(slicer) for blk in self.blocks]

        new_axes = list(self.axes)
        new_axes[axis] = new_axes[axis][slobj]

        bm = self.__class__(new_blocks, new_axes, do_integrity_check=False)
        bm._consolidate_inplace()
        return bm

    def __contains__(self, item):
        return item in self.items

    @property
    def nblocks(self):
        return len(self.blocks)

    def copy(self, deep=True, mgr=None):
        """
        Make deep or shallow copy of BlockManager

        Parameters
        ----------
        deep : boolean or string, default True
            If False, return shallow copy (do not copy data)
            If 'all', copy data and a deep copy of the index

        Returns
        -------
        copy : BlockManager
        """

        # this preserves the notion of view copying of axes
        if deep:
            if deep == 'all':
                copy = lambda ax: ax.copy(deep=True)
            else:
                copy = lambda ax: ax.view()
            new_axes = [copy(ax) for ax in self.axes]
        else:
            new_axes = list(self.axes)
        return self.apply('copy', axes=new_axes, deep=deep,
                          do_integrity_check=False)

    def as_array(self, transpose=False, items=None):
        """Convert the blockmanager data into a numpy array.

        Parameters
        ----------
        transpose : boolean, default False
            If True, transpose the return array
        items : list of strings or None
            Names of block items that will be included in the returned
            array. ``None`` means that all block items will be used

        Returns
        -------
        arr : ndarray
        """
        if len(self.blocks) == 0:
            arr = np.empty(self.shape, dtype=float)
            return arr.transpose() if transpose else arr

        if items is not None:
            mgr = self.reindex_axis(items, axis=0)
        else:
            mgr = self

        if self._is_single_block or not self.is_mixed_type:
            arr = mgr.blocks[0].get_values()
        else:
            arr = mgr._interleave()

        return arr.transpose() if transpose else arr

    def _interleave(self):
        """
        Return ndarray from blocks with specified item order
        Items must be contained in the blocks
        """
        dtype = _interleaved_dtype(self.blocks)

        result = np.empty(self.shape, dtype=dtype)

        if result.shape[0] == 0:
            # Workaround for numpy 1.7 bug:
            #
            # >>> a = np.empty((0,10))
            # >>> a[slice(0,0)]
            # array([], shape=(0, 10), dtype=float64)
            # >>> a[[]]
            # Traceback (most recent call last):
            #   File "<stdin>", line 1, in <module>
            # IndexError: index 0 is out of bounds for axis 0 with size 0
            return result

        itemmask = np.zeros(self.shape[0])

        for blk in self.blocks:
            rl = blk.mgr_locs
            result[rl.indexer] = blk.get_values(dtype)
            itemmask[rl.indexer] = 1

        if not itemmask.all():
            raise AssertionError('Some items were not contained in blocks')

        return result

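    # A minimal sketch: interleaving mixed-dtype blocks falls back to the
    # common dtype computed by _interleaved_dtype, typically object
    # (hypothetical REPL session):
    #
    # >>> df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
    # >>> df._data.as_array().dtype    # int block + object block interleave
    # dtype('O')
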
    def to_dict(self, copy=True):
        """
        Return a dict of str(dtype) -> BlockManager

        Parameters
        ----------
        copy : boolean, default True

        Returns
        -------
        values : a dict of dtype -> BlockManager

        Notes
        -----
        This consolidates based on str(dtype)
        """
        self._consolidate_inplace()

        bd = {}
        for b in self.blocks:
            bd.setdefault(str(b.dtype), []).append(b)

        return {dtype: self.combine(blocks, copy=copy)
                for dtype, blocks in bd.items()}

    def xs(self, key, axis=1, copy=True, takeable=False):
        if axis < 1:
            raise AssertionError(
                'Can only take xs across axis >= 1, got {ax}'.format(ax=axis))

        # take by position
        if takeable:
            loc = key
        else:
            loc = self.axes[axis].get_loc(key)

        slicer = [slice(None, None) for _ in range(self.ndim)]
        slicer[axis] = loc
        slicer = tuple(slicer)

        new_axes = list(self.axes)

        # could be an array indexer!
        if isinstance(loc, (slice, np.ndarray)):
            new_axes[axis] = new_axes[axis][loc]
        else:
            new_axes.pop(axis)

        new_blocks = []
        if len(self.blocks) > 1:
            # we must copy here as we are mixed type
            for blk in self.blocks:
                newb = make_block(values=blk.values[slicer],
                                  klass=blk.__class__,
                                  placement=blk.mgr_locs)
                new_blocks.append(newb)
        elif len(self.blocks) == 1:
            block = self.blocks[0]
            vals = block.values[slicer]
            if copy:
                vals = vals.copy()
            new_blocks = [make_block(values=vals,
                                     placement=block.mgr_locs,
                                     klass=block.__class__)]

        return self.__class__(new_blocks, new_axes)

    def fast_xs(self, loc):
        """
        get a cross-section for a given location in the
        items; handle dups

        return the result, it *could* be a view in the case of a
        single block
        """
        if len(self.blocks) == 1:
            return self.blocks[0].iget((slice(None), loc))

        items = self.items

        # non-unique (GH4726)
        if not items.is_unique:
            result = self._interleave()
            if self.ndim == 2:
                result = result.T
            return result[loc]

        # unique
        dtype = _interleaved_dtype(self.blocks)
        n = len(items)
        result = np.empty(n, dtype=dtype)
        for blk in self.blocks:
            # Such assignment may incorrectly coerce NaT to None
            # result[blk.mgr_locs] = blk._slice((slice(None), loc))
            for i, rl in enumerate(blk.mgr_locs):
                result[rl] = blk._try_coerce_result(blk.iget((i, loc)))

        return result

    def consolidate(self):
        """
        Join together blocks having same dtype

        Returns
        -------
        y : BlockManager
        """
        if self.is_consolidated():
            return self

        bm = self.__class__(self.blocks, self.axes)
        bm._is_consolidated = False
        bm._consolidate_inplace()
        return bm

    def _consolidate_inplace(self):
        if not self.is_consolidated():
            self.blocks = tuple(_consolidate(self.blocks))
            self._is_consolidated = True
            self._known_consolidated = True
            self._rebuild_blknos_and_blklocs()

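    # A minimal sketch of the consolidation above: repeated insertions leave
    # multiple blocks of one dtype, which consolidate() merges back into one
    # (hypothetical REPL session):
    #
    # >>> df = pd.DataFrame({'a': [1.]})
    # >>> df['b'] = 2.                  # second float block gets appended
    # >>> len(df._data.blocks)
    # 2
    # >>> len(df._data.consolidate().blocks)
    # 1
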
    def get(self, item, fastpath=True):
        """
        Return values for selected item (ndarray or BlockManager).
        """
        if self.items.is_unique:

            if not isna(item):
                loc = self.items.get_loc(item)
            else:
                indexer = np.arange(len(self.items))[isna(self.items)]

                # allow a single nan location indexer
                if not is_scalar(indexer):
                    if len(indexer) == 1:
                        loc = indexer.item()
                    else:
                        raise ValueError("cannot label index with a null key")

            return self.iget(loc, fastpath=fastpath)
        else:

            if isna(item):
                raise TypeError("cannot label index with a null key")

            indexer = self.items.get_indexer_for([item])
            return self.reindex_indexer(new_axis=self.items[indexer],
                                        indexer=indexer, axis=0,
                                        allow_dups=True)

    def iget(self, i, fastpath=True):
        """
        Return the data as a SingleBlockManager if fastpath=True and possible

        Otherwise return as a ndarray
        """
        block = self.blocks[self._blknos[i]]
        values = block.iget(self._blklocs[i])
        if not fastpath or not block._box_to_block_values or values.ndim != 1:
            return values

        # fastpath shortcut for selecting a single-dim from a 2-dim BM
        return SingleBlockManager(
            [block.make_block_same_class(values,
                                         placement=slice(0, len(values)),
                                         ndim=1)],
            self.axes[1])

    def get_scalar(self, tup):
        """
        Retrieve single item
        """
        full_loc = [ax.get_loc(x) for ax, x in zip(self.axes, tup)]
        blk = self.blocks[self._blknos[full_loc[0]]]
        values = blk.values

        # FIXME: this may return non-upcasted types?
        if values.ndim == 1:
            return values[full_loc[1]]

        full_loc[0] = self._blklocs[full_loc[0]]
        return values[tuple(full_loc)]

    def delete(self, item):
        """
        Delete selected item (items if non-unique) in-place.
        """
        indexer = self.items.get_loc(item)

        is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
        is_deleted[indexer] = True
        ref_loc_offset = -is_deleted.cumsum()

        is_blk_deleted = [False] * len(self.blocks)

        if isinstance(indexer, int):
            affected_start = indexer
        else:
            affected_start = is_deleted.nonzero()[0][0]

        for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]):
            blk = self.blocks[blkno]
            bml = blk.mgr_locs
            blk_del = is_deleted[bml.indexer].nonzero()[0]

            if len(blk_del) == len(bml):
                is_blk_deleted[blkno] = True
                continue
            elif len(blk_del) != 0:
                blk.delete(blk_del)
                bml = blk.mgr_locs

            blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer])

        # FIXME: use Index.delete as soon as it uses fastpath=True
        self.axes[0] = self.items[~is_deleted]
        self.blocks = tuple(b for blkno, b in enumerate(self.blocks)
                            if not is_blk_deleted[blkno])
        self._shape = None
        self._rebuild_blknos_and_blklocs()

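    # A minimal sketch of the ref_loc_offset arithmetic above (hypothetical):
    # deleting the item at manager location 1 of 4 shifts every later
    # location left by one:
    #
    # >>> is_deleted = np.array([False, True, False, False])
    # >>> -is_deleted.cumsum()
    # array([ 0, -1, -1, -1])
    # surviving locations [0, 2, 3] plus their offsets -> [0, 1, 2]
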
    def set(self, item, value, check=False):
        """
        Set new item in-place. Does not consolidate. Adds new Block if not
        contained in the current set of items
        if check, then validate that we are not setting the same data in-place
        """
        # FIXME: refactor, clearly separate broadcasting & zip-like assignment
        #        can prob also fix the various if tests for sparse/categorical

        # TODO(EA): Remove an is_extension_ when all extension types satisfy
        # the interface
        value_is_extension_type = (is_extension_type(value) or
                                   is_extension_array_dtype(value))

        # categorical/sparse/datetimetz
        if value_is_extension_type:

            def value_getitem(placement):
                return value
        else:
            if value.ndim == self.ndim - 1:
                value = _safe_reshape(value, (1,) + value.shape)

                def value_getitem(placement):
                    return value
            else:

                def value_getitem(placement):
                    return value[placement.indexer]

            if value.shape[1:] != self.shape[1:]:
                raise AssertionError('Shape of new values must be compatible '
                                     'with manager shape')

        try:
            loc = self.items.get_loc(item)
        except KeyError:
            # This item wasn't present, just insert at end
            self.insert(len(self.items), item, value)
            return

        if isinstance(loc, int):
            loc = [loc]

        blknos = self._blknos[loc]
        blklocs = self._blklocs[loc].copy()

        unfit_mgr_locs = []
        unfit_val_locs = []
        removed_blknos = []
        for blkno, val_locs in _get_blkno_placements(blknos, len(self.blocks),
                                                     group=True):
            blk = self.blocks[blkno]
            blk_locs = blklocs[val_locs.indexer]
            if blk.should_store(value):
                blk.set(blk_locs, value_getitem(val_locs), check=check)
            else:
                unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
                unfit_val_locs.append(val_locs)

                # If all block items are unfit, schedule the block for removal.
                if len(val_locs) == len(blk.mgr_locs):
                    removed_blknos.append(blkno)
                else:
                    self._blklocs[blk.mgr_locs.indexer] = -1
                    blk.delete(blk_locs)
                    self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk))

        if len(removed_blknos):
            # Remove blocks & update blknos accordingly
            is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
            is_deleted[removed_blknos] = True

            new_blknos = np.empty(self.nblocks, dtype=np.int64)
            new_blknos.fill(-1)
            new_blknos[~is_deleted] = np.arange(self.nblocks -
                                                len(removed_blknos))
            self._blknos = algos.take_1d(new_blknos, self._blknos, axis=0,
                                         allow_fill=False)
            self.blocks = tuple(blk for i, blk in enumerate(self.blocks)
                                if i not in set(removed_blknos))

        if unfit_val_locs:
            unfit_mgr_locs = np.concatenate(unfit_mgr_locs)
            unfit_count = len(unfit_mgr_locs)

            new_blocks = []
            if value_is_extension_type:
                # This code (ab-)uses the fact that sparse blocks contain only
                # one item.
                new_blocks.extend(
                    make_block(values=value.copy(), ndim=self.ndim,
                               placement=slice(mgr_loc, mgr_loc + 1))
                    for mgr_loc in unfit_mgr_locs)

                self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) +
                                                len(self.blocks))
                self._blklocs[unfit_mgr_locs] = 0

            else:
                # unfit_val_locs contains BlockPlacement objects
                unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])

                new_blocks.append(
                    make_block(values=value_getitem(unfit_val_items),
                               ndim=self.ndim, placement=unfit_mgr_locs))

                self._blknos[unfit_mgr_locs] = len(self.blocks)
                self._blklocs[unfit_mgr_locs] = np.arange(unfit_count)

            self.blocks += tuple(new_blocks)

            # Newly created block's dtype may already be present.
            self._known_consolidated = False

    def insert(self, loc, item, value, allow_duplicates=False):
        """
        Insert item at selected position.

        Parameters
        ----------
        loc : int
        item : hashable
        value : array_like
        allow_duplicates: bool
            If False, trying to insert non-unique item will raise

        """
        if not allow_duplicates and item in self.items:
            # Should this be a different kind of error??
            raise ValueError('cannot insert {}, already exists'.format(item))

        if not isinstance(loc, int):
            raise TypeError("loc must be int")

        # insert to the axis; this could possibly raise a TypeError
        new_axis = self.items.insert(loc, item)

        block = make_block(values=value, ndim=self.ndim,
                           placement=slice(loc, loc + 1))

        for blkno, count in _fast_count_smallints(self._blknos[loc:]):
            blk = self.blocks[blkno]
            if count == len(blk.mgr_locs):
                blk.mgr_locs = blk.mgr_locs.add(1)
            else:
                new_mgr_locs = blk.mgr_locs.as_array.copy()
                new_mgr_locs[new_mgr_locs >= loc] += 1
                blk.mgr_locs = new_mgr_locs

        if loc == self._blklocs.shape[0]:
            # np.append is a lot faster (at least in numpy 1.7.1), let's use it
            # if we can.
            self._blklocs = np.append(self._blklocs, 0)
            self._blknos = np.append(self._blknos, len(self.blocks))
        else:
            self._blklocs = np.insert(self._blklocs, loc, 0)
            self._blknos = np.insert(self._blknos, loc, len(self.blocks))

        self.axes[0] = new_axis
        self.blocks += (block,)
        self._shape = None

        self._known_consolidated = False

        if len(self.blocks) > 100:
            self._consolidate_inplace()

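    # A minimal sketch of the fragmentation insert() guards against
    # (hypothetical REPL session):
    #
    # >>> df = pd.DataFrame(index=range(2))
    # >>> for i in range(3):
    # ...     df[i] = float(i)        # each assignment ends up in insert()
    # >>> len(df._data.blocks)        # one block per inserted column
    # 3
    # the `> 100` check above consolidates once this gets out of hand
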
    def reindex_axis(self, new_index, axis, method=None, limit=None,
                     fill_value=None, copy=True):
        """
        Conform block manager to new index.
        """
        new_index = _ensure_index(new_index)
        new_index, indexer = self.axes[axis].reindex(new_index, method=method,
                                                     limit=limit)

        return self.reindex_indexer(new_index, indexer, axis=axis,
                                    fill_value=fill_value, copy=copy)

    def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,
                        allow_dups=False, copy=True):
        """
        Parameters
        ----------
        new_axis : Index
        indexer : ndarray of int64 or None
            a pandas-style indexer with -1's only (marking locations to fill)
        axis : int
        fill_value : object
        allow_dups : bool
        """
        if indexer is None:
            if new_axis is self.axes[axis] and not copy:
                return self

            result = self.copy(deep=copy)
            result.axes = list(self.axes)
            result.axes[axis] = new_axis
            return result

        self._consolidate_inplace()

        # some axes don't allow reindexing with dups
        if not allow_dups:
            self.axes[axis]._can_reindex(indexer)

        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        if axis == 0:
            new_blocks = self._slice_take_blocks_ax0(indexer,
                                                     fill_tuple=(fill_value,))
        else:
            new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=(
                fill_value if fill_value is not None else blk.fill_value,))
                for blk in self.blocks]

        new_axes = list(self.axes)
        new_axes[axis] = new_axis
        return self.__class__(new_blocks, new_axes)

    def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):
        """
        Slice/take blocks along axis=0.

        Overloaded for SingleBlock

        Returns
        -------
        new_blocks : list of Block

        """

        allow_fill = fill_tuple is not None

        sl_type, slobj, sllen = _preprocess_slice_or_indexer(
            slice_or_indexer, self.shape[0], allow_fill=allow_fill)

        if self._is_single_block:
            blk = self.blocks[0]

            if sl_type in ('slice', 'mask'):
                return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))]
            elif not allow_fill or self.ndim == 1:
                if allow_fill and fill_tuple[0] is None:
                    _, fill_value = maybe_promote(blk.dtype)
                    fill_tuple = (fill_value, )

                return [blk.take_nd(slobj, axis=0,
                                    new_mgr_locs=slice(0, sllen),
                                    fill_tuple=fill_tuple)]

        if sl_type in ('slice', 'mask'):
            blknos = self._blknos[slobj]
            blklocs = self._blklocs[slobj]
        else:
            blknos = algos.take_1d(self._blknos, slobj, fill_value=-1,
                                   allow_fill=allow_fill)
            blklocs = algos.take_1d(self._blklocs, slobj, fill_value=-1,
                                    allow_fill=allow_fill)

        # When filling blknos, make sure blknos is updated before appending to
        # blocks list, that way new blkno is exactly len(blocks).
        #
        # FIXME: mgr_groupby_blknos must return mgr_locs in ascending order,
        # pytables serialization will break otherwise.
        blocks = []
        for blkno, mgr_locs in _get_blkno_placements(blknos, len(self.blocks),
                                                     group=True):
            if blkno == -1:
                # If we've got here, fill_tuple was not None.
                fill_value = fill_tuple[0]

                blocks.append(self._make_na_block(placement=mgr_locs,
                                                  fill_value=fill_value))
            else:
                blk = self.blocks[blkno]

                # Otherwise, slicing along items axis is necessary.
                if not blk._can_consolidate:
                    # A non-consolidatable block, it's easy, because there's
                    # only one item and each mgr loc is a copy of that single
                    # item.
                    for mgr_loc in mgr_locs:
                        newblk = blk.copy(deep=True)
                        newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1)
                        blocks.append(newblk)

                else:
                    blocks.append(blk.take_nd(blklocs[mgr_locs.indexer],
                                              axis=0, new_mgr_locs=mgr_locs,
                                              fill_tuple=None))

        return blocks

    def _make_na_block(self, placement, fill_value=None):
        # TODO: infer dtypes other than float64 from fill_value

        if fill_value is None:
            fill_value = np.nan
        block_shape = list(self.shape)
        block_shape[0] = len(placement)

        dtype, fill_value = infer_dtype_from_scalar(fill_value)
        block_values = np.empty(block_shape, dtype=dtype)
        block_values.fill(fill_value)
        return make_block(block_values, placement=placement)

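    # A minimal sketch of the _blknos/_blklocs bookkeeping used above
    # (hypothetical REPL session): for each manager location i, _blknos[i]
    # names the block holding item i and _blklocs[i] is its row within that
    # block:
    #
    # >>> df = pd.DataFrame({'a': [1.], 'b': [2], 'c': [3.]})
    # >>> df._data._blknos    # 'a' and 'c' share the float block
    # array([0, 1, 0])
    # >>> df._data._blklocs
    # array([0, 0, 1])
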
    def take(self, indexer, axis=1, verify=True, convert=True):
        """
        Take items along any axis.
        """
        self._consolidate_inplace()
        indexer = (np.arange(indexer.start, indexer.stop, indexer.step,
                             dtype='int64')
                   if isinstance(indexer, slice)
                   else np.asanyarray(indexer, dtype='int64'))

        n = self.shape[axis]
        if convert:
            indexer = maybe_convert_indices(indexer, n)

        if verify:
            if ((indexer == -1) | (indexer >= n)).any():
                raise Exception('Indices must be non-negative and less than '
                                'the axis length')

        new_labels = self.axes[axis].take(indexer)
        return self.reindex_indexer(new_axis=new_labels, indexer=indexer,
                                    axis=axis, allow_dups=True)

    def merge(self, other, lsuffix='', rsuffix=''):
        if not self._is_indexed_like(other):
            raise AssertionError('Must have same axes to merge managers')

        l, r = items_overlap_with_suffix(left=self.items, lsuffix=lsuffix,
                                         right=other.items, rsuffix=rsuffix)
        new_items = _concat_indexes([l, r])

        new_blocks = [blk.copy(deep=False) for blk in self.blocks]

        offset = self.shape[0]
        for blk in other.blocks:
            blk = blk.copy(deep=False)
            blk.mgr_locs = blk.mgr_locs.add(offset)
            new_blocks.append(blk)

        new_axes = list(self.axes)
        new_axes[0] = new_items

        return self.__class__(_consolidate(new_blocks), new_axes)

    def _is_indexed_like(self, other):
        """
        Check all axes except items
        """
        if self.ndim != other.ndim:
            raise AssertionError(
                'Number of dimensions must agree, got {ndim} and '
                '{oth_ndim}'.format(ndim=self.ndim, oth_ndim=other.ndim))
        for ax, oax in zip(self.axes[1:], other.axes[1:]):
            if not ax.equals(oax):
                return False
        return True

    def equals(self, other):
        self_axes, other_axes = self.axes, other.axes
        if len(self_axes) != len(other_axes):
            return False
        if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)):
            return False
        self._consolidate_inplace()
        other._consolidate_inplace()
        if len(self.blocks) != len(other.blocks):
            return False

        # canonicalize block order, using a tuple combining the type
        # name and then mgr_locs because there might be unconsolidated
        # blocks (say, Categorical) which can only be distinguished by
        # the iteration order
        def canonicalize(block):
            return (block.dtype.name, block.mgr_locs.as_array.tolist())

        self_blocks = sorted(self.blocks, key=canonicalize)
        other_blocks = sorted(other.blocks, key=canonicalize)
        return all(block.equals(oblock)
                   for block, oblock in zip(self_blocks, other_blocks))

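    # A minimal sketch of the canonicalization above (hypothetical): block
    # order is irrelevant, so sorting by (dtype name, mgr_locs) keys such as
    #
    #   ('float64', [0, 2]) and ('int64', [1])
    #
    # lines both managers' blocks up pairwise before Block.equals is applied.
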
    def unstack(self, unstacker_func):
        """Return a blockmanager with all blocks unstacked.

        Parameters
        ----------
        unstacker_func : callable
            A (partially-applied) ``pd.core.reshape._Unstacker`` class.

        Returns
        -------
        unstacked : BlockManager
        """
        dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
        new_columns = dummy.get_new_columns()
        new_index = dummy.get_new_index()
        new_blocks = []
        columns_mask = []

        for blk in self.blocks:
            blocks, mask = blk._unstack(
                partial(unstacker_func,
                        value_columns=self.items[blk.mgr_locs.indexer]),
                new_columns)

            new_blocks.extend(blocks)
            columns_mask.extend(mask)

        new_columns = new_columns[columns_mask]

        bm = BlockManager(new_blocks, [new_columns, new_index])
        return bm


class SingleBlockManager(BlockManager):
    """ manage a single block with a single axis """

    ndim = 1
    _is_consolidated = True
    _known_consolidated = True
    __slots__ = ()

    def __init__(self, block, axis, do_integrity_check=False, fastpath=False):

        if isinstance(axis, list):
            if len(axis) != 1:
                raise ValueError("cannot create SingleBlockManager with more "
                                 "than 1 axis")
            axis = axis[0]

        # passed from constructor, single block, single axis
        if fastpath:
            self.axes = [axis]
            if isinstance(block, list):

                # empty block
                if len(block) == 0:
                    block = [np.array([])]
                elif len(block) != 1:
                    raise ValueError('Cannot create SingleBlockManager with '
                                     'more than 1 block')
                block = block[0]
        else:
            self.axes = [_ensure_index(axis)]

            # create the block here
            if isinstance(block, list):

                # provide consolidation to the interleaved_dtype
                if len(block) > 1:
                    dtype = _interleaved_dtype(block)
                    block = [b.astype(dtype) for b in block]
                    block = _consolidate(block)

                if len(block) != 1:
                    raise ValueError('Cannot create SingleBlockManager with '
                                     'more than 1 block')
                block = block[0]

        if not isinstance(block, Block):
            block = make_block(block, placement=slice(0, len(axis)), ndim=1)

        self.blocks = [block]

    def _post_setstate(self):
        pass

    @property
    def _block(self):
        return self.blocks[0]

    @property
    def _values(self):
        return self._block.values

    @property
    def _blknos(self):
        """ compat with BlockManager """
        return None

    @property
    def _blklocs(self):
        """ compat with BlockManager """
        return None

    def get_slice(self, slobj, axis=0):
        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        return self.__class__(self._block._slice(slobj),
                              self.index[slobj], fastpath=True)

    @property
    def index(self):
        return self.axes[0]

    def convert(self, **kwargs):
        """ convert the whole block as one """
        kwargs['by_item'] = False
        return self.apply('convert', **kwargs)

    @property
    def dtype(self):
        return self._block.dtype

    @property
    def array_dtype(self):
        return self._block.array_dtype

    @property
    def ftype(self):
        return self._block.ftype

    def get_dtype_counts(self):
        return {self.dtype.name: 1}

    def get_ftype_counts(self):
        return {self.ftype: 1}

    def get_dtypes(self):
        return np.array([self._block.dtype])

    def get_ftypes(self):
        return np.array([self._block.ftype])

    def external_values(self):
        return self._block.external_values()

    def internal_values(self):
        return self._block.internal_values()

    def formatting_values(self):
        """Return the internal values used by the DataFrame/SeriesFormatter"""
        return self._block.formatting_values()

    def get_values(self):
        """ return a dense type view """
        return np.array(self._block.to_dense(), copy=False)

    @property
    def asobject(self):
        """
        return an object dtype array. datetime/timedelta like values are boxed
        to Timestamp/Timedelta instances.
        """
        return self._block.get_values(dtype=object)

    @property
    def _can_hold_na(self):
        return self._block._can_hold_na

    def is_consolidated(self):
        return True

    def _consolidate_check(self):
        pass

    def _consolidate_inplace(self):
        pass

    def delete(self, item):
        """
        Delete single item from SingleBlockManager.

        Ensures that self.blocks doesn't become empty.
        """
        loc = self.items.get_loc(item)
        self._block.delete(loc)
        self.axes[0] = self.axes[0].delete(loc)

    def fast_xs(self, loc):
        """
        fast path for getting a cross-section
        return a view of the data
        """
        return self._block.values[loc]

    def concat(self, to_concat, new_axis):
        """
        Concatenate a list of SingleBlockManagers into a single
        SingleBlockManager.

        Used for pd.concat of Series objects with axis=0.

        Parameters
        ----------
        to_concat : list of SingleBlockManagers
        new_axis : Index of the result

        Returns
        -------
        SingleBlockManager

        """
        non_empties = [x for x in to_concat if len(x) > 0]

        # check if all series are of the same block type:
        if len(non_empties) > 0:
            blocks = [obj.blocks[0] for obj in non_empties]

            if all(type(b) is type(blocks[0]) for b in blocks[1:]):  # noqa
                new_block = blocks[0].concat_same_type(blocks)
            else:
                values = [x.values for x in blocks]
                values = _concat._concat_compat(values)
                new_block = make_block(
                    values, placement=slice(0, len(values), 1))
        else:
            values = [x._block.values for x in to_concat]
            values = _concat._concat_compat(values)
            new_block = make_block(
                values, placement=slice(0, len(values), 1))

        mgr = SingleBlockManager(new_block, new_axis)
        return mgr


def construction_error(tot_items, block_shape, axes, e=None):
    """ raise a helpful message about our construction """
    passed = tuple(map(int, [tot_items] + list(block_shape)))
    implied = tuple(map(int, [len(ax) for ax in axes]))
    if passed == implied and e is not None:
        raise e
    if block_shape[0] == 0:
        raise ValueError("Empty data passed with indices specified.")
    raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
        passed, implied))


def create_block_manager_from_blocks(blocks, axes):
    try:
        if len(blocks) == 1 and not isinstance(blocks[0], Block):
            # if blocks[0] is of length 0, return empty blocks
            if not len(blocks[0]):
                blocks = []
            else:
                # It's OK if a single block is passed as values, its placement
                # is basically "all items", but if there're many, don't bother
                # converting, it's an error anyway.
                blocks = [make_block(values=blocks[0],
                                     placement=slice(0, len(axes[0])))]

        mgr = BlockManager(blocks, axes)
        mgr._consolidate_inplace()
        return mgr

    except (ValueError) as e:
        blocks = [getattr(b, 'values', b) for b in blocks]
        tot_items = sum(b.shape[0] for b in blocks)
        construction_error(tot_items, blocks[0].shape[1:], axes, e)


def create_block_manager_from_arrays(arrays, names, axes):

    try:
        blocks = form_blocks(arrays, names, axes)
        mgr = BlockManager(blocks, axes)
        mgr._consolidate_inplace()
        return mgr
    except ValueError as e:
        construction_error(len(arrays), arrays[0].shape, axes, e)


def form_blocks(arrays, names, axes):
    # put "leftover" items in float bucket, where else?
    # generalize?
    items_dict = defaultdict(list)
    extra_locs = []

    names_idx = _ensure_index(names)
    if names_idx.equals(axes[0]):
        names_indexer = np.arange(len(names_idx))
    else:
        assert names_idx.intersection(axes[0]).is_unique
        names_indexer = names_idx.get_indexer_for(axes[0])

    for i, name_idx in enumerate(names_indexer):
        if name_idx == -1:
            extra_locs.append(i)
            continue

        k = names[name_idx]
        v = arrays[name_idx]

        block_type = get_block_type(v)
        items_dict[block_type.__name__].append((i, k, v))

    blocks = []
    if len(items_dict['FloatBlock']):
        float_blocks = _multi_blockify(items_dict['FloatBlock'])
        blocks.extend(float_blocks)

    if len(items_dict['ComplexBlock']):
        complex_blocks = _multi_blockify(items_dict['ComplexBlock'])
        blocks.extend(complex_blocks)

    if len(items_dict['TimeDeltaBlock']):
        timedelta_blocks = _multi_blockify(items_dict['TimeDeltaBlock'])
        blocks.extend(timedelta_blocks)

    if len(items_dict['IntBlock']):
        int_blocks = _multi_blockify(items_dict['IntBlock'])
        blocks.extend(int_blocks)

    if len(items_dict['DatetimeBlock']):
        datetime_blocks = _simple_blockify(items_dict['DatetimeBlock'],
                                           _NS_DTYPE)
        blocks.extend(datetime_blocks)

    if len(items_dict['DatetimeTZBlock']):
        dttz_blocks = [make_block(array,
                                  klass=DatetimeTZBlock,
                                  placement=[i])
                       for i, _, array in items_dict['DatetimeTZBlock']]
        blocks.extend(dttz_blocks)

    if len(items_dict['BoolBlock']):
        bool_blocks = _simple_blockify(items_dict['BoolBlock'], np.bool_)
        blocks.extend(bool_blocks)

    if len(items_dict['ObjectBlock']) > 0:
        object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_)
        blocks.extend(object_blocks)

    if len(items_dict['SparseBlock']) > 0:
        sparse_blocks = _sparse_blockify(items_dict['SparseBlock'])
        blocks.extend(sparse_blocks)

    if len(items_dict['CategoricalBlock']) > 0:
        cat_blocks = [make_block(array, klass=CategoricalBlock, placement=[i])
                      for i, _, array in items_dict['CategoricalBlock']]
        blocks.extend(cat_blocks)

    if len(items_dict['ExtensionBlock']):

        external_blocks = [
            make_block(array, klass=ExtensionBlock, placement=[i])
            for i, _, array in items_dict['ExtensionBlock']
        ]

        blocks.extend(external_blocks)

    if len(extra_locs):
        shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:])

        # empty items -> dtype object
        block_values = np.empty(shape, dtype=object)
        block_values.fill(np.nan)

        na_block = make_block(block_values, placement=extra_locs)
        blocks.append(na_block)

    return blocks


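# A minimal sketch of the bucketing performed by form_blocks (hypothetical
# REPL session):
#
# >>> df = pd.DataFrame({'f': [1.5], 'i': [1], 'o': ['x']})
# >>> sorted(str(b.dtype) for b in df._data.blocks)
# ['float64', 'int64', 'object']

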
def _simple_blockify(tuples, dtype):
    """ return a single array of a block that has a single dtype; if dtype is
    not None, coerce to this dtype
    """
    values, placement = _stack_arrays(tuples, dtype)

    # CHECK DTYPE?
    if dtype is not None and values.dtype != dtype:  # pragma: no cover
        values = values.astype(dtype)

    block = make_block(values, placement=placement)
    return [block]


def _multi_blockify(tuples, dtype=None):
    """ return an array of blocks that potentially have different dtypes """

    # group by dtype
    grouper = itertools.groupby(tuples, lambda x: x[2].dtype)

    new_blocks = []
    for dtype, tup_block in grouper:

        values, placement = _stack_arrays(list(tup_block), dtype)

        block = make_block(values, placement=placement)
        new_blocks.append(block)

    return new_blocks


def _sparse_blockify(tuples, dtype=None):
    """ return an array of blocks that potentially have different dtypes (and
    are sparse)
    """

    new_blocks = []
    for i, names, array in tuples:
        array = _maybe_to_sparse(array)
        block = make_block(array, klass=SparseBlock, placement=[i])
        new_blocks.append(block)

    return new_blocks


def _stack_arrays(tuples, dtype):

    # Series need special-casing: use the underlying values rather than
    # letting np.asarray convert the whole object
    def _asarray_compat(x):
        if isinstance(x, ABCSeries):
            return x._values
        else:
            return np.asarray(x)

    def _shape_compat(x):
        if isinstance(x, ABCSeries):
            return len(x),
        else:
            return x.shape

    placement, names, arrays = zip(*tuples)

    first = arrays[0]
    shape = (len(arrays),) + _shape_compat(first)

    stacked = np.empty(shape, dtype=dtype)
    for i, arr in enumerate(arrays):
        stacked[i] = _asarray_compat(arr)

    return stacked, placement


def _interleaved_dtype(blocks):
    if not len(blocks):
        return None

    dtype = find_common_type([b.dtype for b in blocks])

    # only numpy compat
    if isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)):
        dtype = np.object

    return dtype


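# A minimal sketch of the promotion performed above (hypothetical REPL
# session):
#
# >>> find_common_type([np.dtype('int64'), np.dtype('float64')])
# dtype('float64')
# >>> find_common_type([np.dtype('int64'), np.dtype('object')])
# dtype('O')

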
def _consolidate(blocks):
    """
    Merge blocks having same dtype, exclude non-consolidating blocks
    """

    # sort by _can_consolidate, dtype
    gkey = lambda x: x._consolidate_key
    grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)

    new_blocks = []
    for (_can_consolidate, dtype), group_blocks in grouper:
        merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype,
                                      _can_consolidate=_can_consolidate)
        new_blocks = _extend_blocks(merged_blocks, new_blocks)
    return new_blocks


def _merge_blocks(blocks, dtype=None, _can_consolidate=True):

    if len(blocks) == 1:
        return blocks[0]

    if _can_consolidate:

        if dtype is None:
            if len({b.dtype for b in blocks}) != 1:
                raise AssertionError("_merge_blocks are invalid!")
            dtype = blocks[0].dtype

        # FIXME: optimization potential in case all mgrs contain slices and
        # combination of those slices is a slice, too.
        new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
        new_values = _vstack([b.values for b in blocks], dtype)

        argsort = np.argsort(new_mgr_locs)
        new_values = new_values[argsort]
        new_mgr_locs = new_mgr_locs[argsort]

        return make_block(new_values, placement=new_mgr_locs)

    # no merge
    return blocks


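# A minimal sketch of the argsort step in _merge_blocks (hypothetical):
# stacking blocks placed at manager locations [2, 0] and [1] yields rows in
# block order, so values and locations are re-sorted into manager order:
#
# >>> new_mgr_locs = np.concatenate([[2, 0], [1]])
# >>> new_mgr_locs
# array([2, 0, 1])
# >>> np.argsort(new_mgr_locs)
# array([1, 2, 0])

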
def _extend_blocks(result, blocks=None):
    """ return a new, extended list of blocks, given the result """
    if blocks is None:
        blocks = []
    if isinstance(result, list):
        for r in result:
            if isinstance(r, list):
                blocks.extend(r)
            else:
                blocks.append(r)
    elif isinstance(result, BlockManager):
        blocks.extend(result.blocks)
    else:
        blocks.append(result)
    return blocks


def _block_shape(values, ndim=1, shape=None):
    """ guarantee the shape of the values to be at least 1 d """
    if values.ndim < ndim:
        if shape is None:
            shape = values.shape
        values = values.reshape(tuple((1, ) + shape))
    return values


def _vstack(to_stack, dtype):

    # work around NumPy 1.6 bug
    if dtype == _NS_DTYPE or dtype == _TD_DTYPE:
        new_values = np.vstack([x.view('i8') for x in to_stack])
        return new_values.view(dtype)

    else:
        return np.vstack(to_stack)


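# A minimal sketch of the i8 round-trip in _vstack (hypothetical REPL
# session): datetimelike data is stacked via its integer representation and
# viewed back unchanged:
#
# >>> a = np.array(['2018-01-01'], dtype='M8[ns]')
# >>> np.vstack([a.view('i8')]).view(_NS_DTYPE).dtype
# dtype('<M8[ns]')

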
def _maybe_compare(a, b, op):

    is_a_array = isinstance(a, np.ndarray)
    is_b_array = isinstance(b, np.ndarray)

    # numpy deprecation warning to have i8 vs integer comparisons
    if is_datetimelike_v_numeric(a, b):
        result = False

    # numpy deprecation warning if comparing numeric vs string-like
    elif is_numeric_v_string_like(a, b):
        result = False

    else:
        result = op(a, b)

    if is_scalar(result) and (is_a_array or is_b_array):
        type_names = [type(a).__name__, type(b).__name__]

        if is_a_array:
            type_names[0] = 'ndarray(dtype={dtype})'.format(dtype=a.dtype)

        if is_b_array:
            type_names[1] = 'ndarray(dtype={dtype})'.format(dtype=b.dtype)

        raise TypeError(
            "Cannot compare types {a!r} and {b!r}".format(a=type_names[0],
                                                          b=type_names[1]))
    return result


def _concat_indexes(indexes):
    return indexes[0].append(indexes[1:])


def _block2d_to_blocknd(values, placement, shape, labels, ref_items):
    """ pivot to the labels shape """
    panel_shape = (len(placement),) + shape

    # TODO: lexsort depth needs to be 2!!

    # Create observation selection vector using major and minor
    # labels, for converting to panel format.
    selector = _factor_indexer(shape[1:], labels)
    mask = np.zeros(np.prod(shape), dtype=bool)
    mask.put(selector, True)

    if mask.all():
        pvalues = np.empty(panel_shape, dtype=values.dtype)
    else:
        dtype, fill_value = maybe_promote(values.dtype)
        pvalues = np.empty(panel_shape, dtype=dtype)
        pvalues.fill(fill_value)

    for i in range(len(placement)):
        pvalues[i].flat[mask] = values[:, i]

    return make_block(pvalues, placement=placement)


def _factor_indexer(shape, labels):
    """
    given a tuple of shape and a list of Categorical labels, return the
    expanded label indexer
    """
    mult = np.array(shape)[::-1].cumprod()[::-1]
    return _ensure_platform_int(
        np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T)


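# A minimal sketch of the row-major arithmetic in _factor_indexer
# (hypothetical): with trailing shape (3,), a (major, minor) label pair
# (i, j) maps to flat position i * 3 + j:
#
# >>> _factor_indexer((3,), [np.array([0, 1]), np.array([2, 0])])
# array([2, 3])

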
def _get_blkno_placements(blknos, blk_count, group=True):
    """

    Parameters
    ----------
    blknos : array of int64
    blk_count : int
    group : bool

    Returns
    -------
    iterator
        yield (blkno, BlockPlacement)

    """

    blknos = _ensure_int64(blknos)

    # FIXME: blk_count is unused, but it may avoid the use of dicts in cython
    for blkno, indexer in libinternals.get_blkno_indexers(blknos, group):
        yield blkno, BlockPlacement(indexer)


def items_overlap_with_suffix(left, lsuffix, right, rsuffix):
    """
    If two indices overlap, add suffixes to overlapping entries.

    If corresponding suffix is empty, the entry is simply converted to string.

    """
    to_rename = left.intersection(right)
    if len(to_rename) == 0:
        return left, right
    else:
        if not lsuffix and not rsuffix:
            raise ValueError('columns overlap but no suffix specified: '
                             '{rename}'.format(rename=to_rename))

        def lrenamer(x):
            if x in to_rename:
                return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix)
            return x

        def rrenamer(x):
            if x in to_rename:
                return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix)
            return x

        return (_transform_index(left, lrenamer),
                _transform_index(right, rrenamer))


def _safe_reshape(arr, new_shape):
    """
    If possible, reshape `arr` to have shape `new_shape`,
    with a couple of exceptions (see gh-13012):

    1) If `arr` is a ExtensionArray or Index, `arr` will be
       returned as is.
    2) If `arr` is a Series, the `_values` attribute will
       be reshaped and returned.

    Parameters
    ----------
    arr : array-like, object to be reshaped
    new_shape : int or tuple of ints, the new shape
    """
    if isinstance(arr, ABCSeries):
        arr = arr._values
    if not isinstance(arr, ABCExtensionArray):
        arr = arr.reshape(new_shape)
    return arr


def _transform_index(index, func, level=None):
    """
    Apply function to all values found in index.

    This includes transforming multiindex entries separately.
    Only apply function to one level of the MultiIndex if level is specified.

    """
    if isinstance(index, MultiIndex):
        if level is not None:
            items = [tuple(func(y) if i == level else y
                           for i, y in enumerate(x)) for x in index]
        else:
            items = [tuple(func(y) for y in x) for x in index]
        return MultiIndex.from_tuples(items, names=index.names)
    else:
        items = [func(x) for x in index]
        return Index(items, name=index.name, tupleize_cols=False)


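# A minimal sketch of the suffixing performed by items_overlap_with_suffix
# above (hypothetical REPL session): only the overlapping labels get renamed:
#
# >>> l, r = items_overlap_with_suffix(pd.Index(['a', 'b']), '_x',
# ...                                  pd.Index(['b', 'c']), '_y')
# >>> list(l), list(r)
# (['a', 'b_x'], ['b_y', 'c'])

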


def _putmask_smart(v, m, n):
    """
    Return a new ndarray, try to preserve dtype if possible.

    Parameters
    ----------
    v : `values`, updated in-place (array like)
    m : `mask`, applies to both sides (array like)
    n : `new values` either scalar or an array like aligned with `values`

    Returns
    -------
    values : ndarray with updated values
        this *may* be a copy of the original

    See Also
    --------
    ndarray.putmask
    """

    # we cannot use np.asarray() here as we cannot have conversions
    # that numpy does when numeric are mixed with strings

    # n should be the length of the mask or a scalar here
    if not is_list_like(n):
        n = np.repeat(n, len(m))
    elif isinstance(n, np.ndarray) and n.ndim == 0:  # numpy scalar
        n = np.repeat(np.array(n, ndmin=1), len(m))

    # see if we are only masking values that, if put,
    # will work in the current dtype
    try:
        nn = n[m]

        # make sure that we have a nullable type
        # if we have nulls
        if not _isna_compat(v, nn[0]):
            raise ValueError

        # we ignore ComplexWarning here
        with catch_warnings(record=True):
            nn_at = nn.astype(v.dtype)

        # avoid invalid dtype comparisons
        # between numbers & strings

        # only compare integers/floats
        # don't compare integers to datetimelikes
        if (not is_numeric_v_string_like(nn, nn_at) and
                (is_float_dtype(nn.dtype) or
                 (is_integer_dtype(nn.dtype) and
                  is_float_dtype(nn_at.dtype)) or
                 is_integer_dtype(nn_at.dtype))):

            comp = (nn == nn_at)
            if is_list_like(comp) and comp.all():
                nv = v.copy()
                nv[m] = nn_at
                return nv
    except (ValueError, IndexError, TypeError):
        pass

    n = np.asarray(n)

    def _putmask_preserve(nv, n):
        try:
            nv[m] = n[m]
        except (IndexError, ValueError):
            nv[m] = n
        return nv

    # preserves dtype if possible
    if v.dtype.kind == n.dtype.kind:
        return _putmask_preserve(v, n)

    # change the dtype if needed
    dtype, _ = maybe_promote(n.dtype)

    if is_extension_type(v.dtype) and is_object_dtype(dtype):
        v = v.get_values(dtype)
    else:
        v = v.astype(dtype)

    return _putmask_preserve(v, n)
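

# Illustrative sketch (hypothetical helper, never called at import time):
# float values that round-trip through the target dtype keep int64; a
# genuinely fractional value forces a promotion to float64.
def _example_putmask_smart():
    v = np.array([1, 2, 3], dtype=np.int64)
    m = np.array([True, False, True])
    same = _putmask_smart(v.copy(), m, np.array([10.0, 0.0, 30.0]))
    # same -> array([10, 2, 30]), dtype int64 preserved
    upcast = _putmask_smart(v.copy(), m, np.array([0.5, 0.0, 0.5]))
    # upcast -> array([0.5, 2.0, 0.5]), promoted to float64
    return same, upcast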


def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
    """
    Concatenate block managers into one.

    Parameters
    ----------
    mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
    axes : list of Index
    concat_axis : int
    copy : bool

    Returns
    -------
    BlockManager
    """
    concat_plan = combine_concat_plans(
        [get_mgr_concatenation_plan(mgr, indexers)
         for mgr, indexers in mgrs_indexers], concat_axis)

    blocks = []

    for placement, join_units in concat_plan:

        if len(join_units) == 1 and not join_units[0].indexers:
            b = join_units[0].block
            values = b.values
            if copy:
                values = values.copy()
            else:
                values = values.view()
            b = b.make_block_same_class(values, placement=placement)
        elif is_uniform_join_units(join_units):
            b = join_units[0].block.concat_same_type(
                [ju.block for ju in join_units], placement=placement)
        else:
            b = make_block(
                concatenate_join_units(join_units, concat_axis, copy=copy),
                placement=placement)
        blocks.append(b)

    return BlockManager(blocks, axes)
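

# Illustrative sketch (hypothetical helper, never called at import time):
# in this era of the codebase DataFrame concatenation funnels into
# concatenate_block_managers, so dtype upcasting between managers is
# observable from the result of `concat`.
def _example_concat_managers():
    from pandas import DataFrame, concat
    a = DataFrame({'x': [1, 2]})
    b = DataFrame({'x': [3.0, 4.0]})
    # int64 and float64 blocks are not uniform join units, so the generic
    # concatenate_join_units path upcasts the result to float64.
    return concat([a, b], ignore_index=True)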


def is_uniform_join_units(join_units):
    """
    Check if the join units consist of blocks of uniform type that can
    be concatenated using Block.concat_same_type instead of the generic
    concatenate_join_units (which uses `_concat._concat_compat`).

    """
    return (
        # all blocks need to have the same type
        all(type(ju.block) is type(join_units[0].block) for ju in join_units) and  # noqa
        # no blocks that would get missing values (can lead to type upcasts)
        # unless we're an extension dtype.
        all(not ju.is_na or ju.block.is_extension for ju in join_units) and
        # no blocks with indexers (as then the dimensions do not fit)
        all(not ju.indexers for ju in join_units) and
        # disregard Panels
        all(ju.block.ndim <= 2 for ju in join_units) and
        # only use this path when there is something to concatenate
        len(join_units) > 1)


def is_uniform_reindex(join_units):
    """
    Check whether all join units hold extension blocks of a single dtype,
    so a uniform reindex/fill is possible.
    """
    return (
        # TODO: should this be ju.block._can_hold_na?
        all(ju.block and ju.block.is_extension for ju in join_units) and
        len(set(ju.block.dtype.name for ju in join_units)) == 1
    )


def get_empty_dtype_and_na(join_units):
    """
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None which means there was no casting
    involved.

    Returns
    -------
    dtype
    na
    """

    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.float64, np.nan

    if is_uniform_reindex(join_units):
        # XXX: integrate property
        empty_dtype = join_units[0].block.dtype
        upcasted_na = join_units[0].block.fill_value
        return empty_dtype, upcasted_na

    has_none_blocks = False
    dtypes = [None] * len(join_units)
    for i, unit in enumerate(join_units):
        if unit.block is None:
            has_none_blocks = True
        else:
            dtypes[i] = unit.dtype

    upcast_classes = defaultdict(list)
    null_upcast_classes = defaultdict(list)
    for dtype, unit in zip(dtypes, join_units):
        if dtype is None:
            continue

        if is_categorical_dtype(dtype):
            upcast_cls = 'category'
        elif is_datetimetz(dtype):
            upcast_cls = 'datetimetz'
        elif issubclass(dtype.type, np.bool_):
            upcast_cls = 'bool'
        elif issubclass(dtype.type, np.object_):
            upcast_cls = 'object'
        elif is_datetime64_dtype(dtype):
            upcast_cls = 'datetime'
        elif is_timedelta64_dtype(dtype):
            upcast_cls = 'timedelta'
        elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
            upcast_cls = dtype.name
        else:
            upcast_cls = 'float'

        # Null blocks should not influence upcast class selection, unless
        # there are only null blocks, in which case the same upcasting rules
        # must be applied to the null upcast classes.
        if unit.is_na:
            null_upcast_classes[upcast_cls].append(dtype)
        else:
            upcast_classes[upcast_cls].append(dtype)

    if not upcast_classes:
        upcast_classes = null_upcast_classes

    # create the result
    if 'object' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'bool' in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        else:
            return np.dtype(np.bool_), None
    elif 'category' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'datetimetz' in upcast_classes:
        dtype = upcast_classes['datetimetz']
        return dtype[0], tslib.iNaT
    elif 'datetime' in upcast_classes:
        return np.dtype('M8[ns]'), tslib.iNaT
    elif 'timedelta' in upcast_classes:
        return np.dtype('m8[ns]'), tslib.iNaT
    else:  # pragma
        g = np.find_common_type(upcast_classes, [])
        if is_float_dtype(g):
            return g, g.type(np.nan)
        elif is_numeric_dtype(g):
            if has_none_blocks:
                return np.float64, np.nan
            else:
                return g, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
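

# Illustrative sketch (hypothetical helper, never called at import time):
# an int64 block concatenated with a missing (None-block) unit hits the
# numeric + has_none_blocks rule and upcasts to float64/NaN.
def _example_empty_dtype_rules():
    blk = make_block(np.array([[1, 2]], dtype=np.int64), placement=[0])
    units = [JoinUnit(blk, shape=(1, 2)), JoinUnit(None, shape=(1, 2))]
    return get_empty_dtype_and_na(units)
    # -> (np.float64, nan)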


def concatenate_join_units(join_units, concat_axis, copy):
    """
    Concatenate values from several join units along selected axis.
    """
    if concat_axis == 0 and len(join_units) > 1:
        # Concatenating join units along ax0 is handled in _merge_blocks.
        raise AssertionError("Concatenating join units along axis0")

    empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units)

    to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype,
                                         upcasted_na=upcasted_na)
                 for ju in join_units]

    if len(to_concat) == 1:
        # Only one block, nothing to concatenate.
        concat_values = to_concat[0]
        if copy:
            if isinstance(concat_values, np.ndarray):
                # non-reindexed (=not yet copied) arrays are made into a view
                # in JoinUnit.get_reindexed_values
                if concat_values.base is not None:
                    concat_values = concat_values.copy()
            else:
                concat_values = concat_values.copy()
    else:
        concat_values = _concat._concat_compat(to_concat, axis=concat_axis)

    return concat_values


def get_mgr_concatenation_plan(mgr, indexers):
    """
    Construct concatenation plan for given block manager and indexers.

    Parameters
    ----------
    mgr : BlockManager
    indexers : dict of {axis: indexer}

    Returns
    -------
    plan : list of (BlockPlacement, JoinUnit) tuples

    """
    # Calculate post-reindex shape, save for item axis which will be
    # separate for each block anyway.
    mgr_shape = list(mgr.shape)
    for ax, indexer in indexers.items():
        mgr_shape[ax] = len(indexer)
    mgr_shape = tuple(mgr_shape)

    if 0 in indexers:
        ax0_indexer = indexers.pop(0)
        blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1)
        blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1)
    else:

        if mgr._is_single_block:
            blk = mgr.blocks[0]
            return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]

        ax0_indexer = None
        blknos = mgr._blknos
        blklocs = mgr._blklocs

    plan = []
    for blkno, placements in _get_blkno_placements(blknos, len(mgr.blocks),
                                                   group=False):

        assert placements.is_slice_like

        join_unit_indexers = indexers.copy()

        shape = list(mgr_shape)
        shape[0] = len(placements)
        shape = tuple(shape)

        if blkno == -1:
            unit = JoinUnit(None, shape)
        else:
            blk = mgr.blocks[blkno]
            ax0_blk_indexer = blklocs[placements.indexer]

            unit_no_ax0_reindexing = (len(placements) == len(blk.mgr_locs) and
                                      # Fastpath detection of join unit not
                                      # needing to reindex its block: no ax0
                                      # reindexing took place and block
                                      # placement was sequential before.
                                      ((ax0_indexer is None and
                                        blk.mgr_locs.is_slice_like and
                                        blk.mgr_locs.as_slice.step == 1) or
                                       # Slow-ish detection: all indexer locs
                                       # are sequential (and length match is
                                       # checked above).
                                       (np.diff(ax0_blk_indexer) == 1).all()))

            # Omit indexer if no item reindexing is required.
            if unit_no_ax0_reindexing:
                join_unit_indexers.pop(0, None)
            else:
                join_unit_indexers[0] = ax0_blk_indexer

            unit = JoinUnit(blk, shape, join_unit_indexers)

        plan.append((placements, unit))

    return plan
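

# Illustrative sketch (hypothetical helper, never called at import time):
# with no indexers the plan simply maps each block of the manager to a
# JoinUnit that reuses the block as-is.
def _example_concat_plan():
    from pandas import DataFrame
    mgr = DataFrame({'a': [1, 2], 'b': [3.0, 4.0]})._data
    # Two consolidated blocks (int64 and float64) -> two plan entries of
    # (BlockPlacement, JoinUnit) with empty join-unit indexers.
    return get_mgr_concatenation_plan(mgr, {})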


def combine_concat_plans(plans, concat_axis):
    """
    Combine multiple concatenation plans into one.

    The units of the input plans may be trimmed in-place while aligning
    plans of unequal placement lengths (see trim_join_unit).
    """
    if len(plans) == 1:
        for p in plans[0]:
            yield p[0], [p[1]]

    elif concat_axis == 0:
        offset = 0
        for plan in plans:
            last_plc = None

            for plc, unit in plan:
                yield plc.add(offset), [unit]
                last_plc = plc

            if last_plc is not None:
                offset += last_plc.as_slice.stop

    else:
        num_ended = [0]

        def _next_or_none(seq):
            retval = next(seq, None)
            if retval is None:
                num_ended[0] += 1
            return retval

        plans = list(map(iter, plans))
        next_items = list(map(_next_or_none, plans))

        while num_ended[0] != len(next_items):
            if num_ended[0] > 0:
                raise ValueError("Plan shapes are not aligned")

            placements, units = zip(*next_items)

            lengths = list(map(len, placements))
            min_len, max_len = min(lengths), max(lengths)

            if min_len == max_len:
                yield placements[0], units
                next_items[:] = map(_next_or_none, plans)
            else:
                yielded_placement = None
                yielded_units = [None] * len(next_items)
                for i, (plc, unit) in enumerate(next_items):
                    yielded_units[i] = unit
                    if len(plc) > min_len:
                        # trim_join_unit updates unit in place, so only
                        # placement needs to be sliced to skip min_len.
                        next_items[i] = (plc[min_len:],
                                         trim_join_unit(unit, min_len))
                    else:
                        yielded_placement = plc
                        next_items[i] = _next_or_none(plans[i])

                yield yielded_placement, yielded_units


def trim_join_unit(join_unit, length):
    """
    Reduce join_unit's shape along item axis to length.

    Extra items that didn't fit are returned as a separate block.
    """

    if 0 not in join_unit.indexers:
        extra_indexers = join_unit.indexers

        if join_unit.block is None:
            extra_block = None
        else:
            extra_block = join_unit.block.getitem_block(slice(length, None))
            join_unit.block = join_unit.block.getitem_block(slice(length))
    else:
        extra_block = join_unit.block

        extra_indexers = copy.copy(join_unit.indexers)
        extra_indexers[0] = extra_indexers[0][length:]
        join_unit.indexers[0] = join_unit.indexers[0][:length]

    extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
    join_unit.shape = (length,) + join_unit.shape[1:]

    return JoinUnit(block=extra_block, indexers=extra_indexers,
                    shape=extra_shape)
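

# Illustrative sketch (hypothetical helper, never called at import time;
# JoinUnit is defined below and resolved lazily at call time): trimming
# splits a unit's item axis in two, mutating the original unit.
def _example_trim_join_unit():
    unit = JoinUnit(None, shape=(5, 3), indexers={0: np.arange(5)})
    extra = trim_join_unit(unit, 2)
    # unit.shape == (2, 3); extra.shape == (3, 3)
    return unit, extra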


class JoinUnit(object):

    def __init__(self, block, shape, indexers=None):
        # Passing shape explicitly is required for cases when block is None.
        if indexers is None:
            indexers = {}
        self.block = block
        self.indexers = indexers
        self.shape = shape

    def __repr__(self):
        return '{name}({block!r}, {indexers})'.format(
            name=self.__class__.__name__, block=self.block,
            indexers=self.indexers)

    @cache_readonly
    def needs_filling(self):
        for indexer in self.indexers.values():
            # FIXME: cache results of indexer == -1 checks.
            if (indexer == -1).any():
                return True

        return False

    @cache_readonly
    def dtype(self):
        if self.block is None:
            raise AssertionError("Block is None, no dtype")

        if not self.needs_filling:
            return self.block.dtype
        else:
            return _get_dtype(maybe_promote(self.block.dtype,
                                            self.block.fill_value)[0])

    @cache_readonly
    def is_na(self):
        if self.block is None:
            return True

        if not self.block._can_hold_na:
            return False

        # Usually it's enough to check just a small fraction of the values
        # to see that a block is NOT null; chunking should help in such
        # cases. The chunk size of 1000 was chosen rather arbitrarily.
        values = self.block.values
        if self.block.is_categorical:
            values_flat = values.categories
        elif self.block.is_sparse:
            # fill_value is not NaN and there are holes
            if not values._null_fill_value and values.sp_index.ngaps > 0:
                return False
            values_flat = values.ravel(order='K')
        elif isinstance(self.block, ExtensionBlock):
            values_flat = values
        else:
            values_flat = values.ravel(order='K')
        total_len = values_flat.shape[0]
        chunk_len = max(total_len // 40, 1000)
        for i in range(0, total_len, chunk_len):
            if not isna(values_flat[i:i + chunk_len]).all():
                return False

        return True

    def get_reindexed_values(self, empty_dtype, upcasted_na):
        if upcasted_na is None:
            # No upcasting is necessary
            fill_value = self.block.fill_value
            values = self.block.get_values()
        else:
            fill_value = upcasted_na

            if self.is_na:
                if getattr(self.block, 'is_object', False):
                    # we want to avoid filling with np.nan if we are
                    # using None; we already know that we are all
                    # nulls
                    values = self.block.values.ravel(order='K')
                    if len(values) and values[0] is None:
                        fill_value = None

                if (getattr(self.block, 'is_datetimetz', False) or
                        is_datetimetz(empty_dtype)):
                    pass
                elif getattr(self.block, 'is_categorical', False):
                    pass
                elif getattr(self.block, 'is_sparse', False):
                    pass
                else:
                    missing_arr = np.empty(self.shape, dtype=empty_dtype)
                    missing_arr.fill(fill_value)
                    return missing_arr

            if not self.indexers:
                if not self.block._can_consolidate:
                    # preserve these for validation in _concat_compat
                    return self.block.values

            if self.block.is_bool and not self.block.is_categorical:
                # External code requested filling/upcasting, bool values must
                # be upcasted to object to avoid being upcasted to numeric.
                values = self.block.astype(np.object_).values
            elif self.block.is_extension:
                values = self.block.values
            else:
                # No dtype upcasting is done here, it will be performed
                # during concatenation itself.
                values = self.block.get_values()

        if not self.indexers:
            # If there's no indexing to be done, we want to signal outside
            # code that this array must be copied explicitly. This is done
            # by returning a view and checking `retval.base`.
            values = values.view()

        else:
            for ax, indexer in self.indexers.items():
                values = algos.take_nd(values, indexer, axis=ax,
                                       fill_value=fill_value)

        return values
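

# Illustrative sketch (hypothetical helper, never called at import time):
# a unit without a block is all-NA by definition, and a -1 in any indexer
# marks the unit as needing fill values.
def _example_join_unit():
    all_na = JoinUnit(None, shape=(2, 2))
    assert all_na.is_na
    needs_fill = JoinUnit(None, (2, 2), indexers={0: np.array([0, -1])})
    assert needs_fill.needs_filling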


def _fast_count_smallints(arr):
    """
    Count occurrences of each value in an array of small non-negative ints;
    returns an array of [value, count] rows.
    """
    if len(arr) == 0:
        # Handle empty arr case separately: numpy 1.6 chokes on that.
        return np.empty((0, 2), dtype=arr.dtype)
    else:
        counts = np.bincount(arr.astype(np.int_))
        nz = counts.nonzero()[0]
        return np.c_[nz, counts[nz]]
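

# Illustrative sketch (hypothetical helper, never called at import time):
# the result pairs each observed value with its count.
def _example_fast_count_smallints():
    counts = _fast_count_smallints(np.array([0, 1, 1, 3, 1]))
    # counts -> array([[0, 1],
    #                  [1, 3],
    #                  [3, 1]])
    return counts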


def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill):
    if isinstance(slice_or_indexer, slice):
        return ('slice', slice_or_indexer,
                libinternals.slice_len(slice_or_indexer, length))
    elif (isinstance(slice_or_indexer, np.ndarray) and
          slice_or_indexer.dtype == np.bool_):
        return 'mask', slice_or_indexer, slice_or_indexer.sum()
    else:
        indexer = np.asanyarray(slice_or_indexer, dtype=np.int64)
        if not allow_fill:
            indexer = maybe_convert_indices(indexer, length)
        return 'fancy', indexer, len(indexer)
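

# Illustrative sketch (hypothetical helper, never called at import time):
# each indexer kind normalizes to a (kind, indexer, length) triple.
def _example_preprocess_indexer():
    s = _preprocess_slice_or_indexer(slice(0, 4), length=10,
                                     allow_fill=False)
    # s -> ('slice', slice(0, 4), 4)
    mask = np.array([True, False, True])
    m = _preprocess_slice_or_indexer(mask, length=3, allow_fill=False)
    # m -> ('mask', mask, 2)
    f = _preprocess_slice_or_indexer([0, 2], length=3, allow_fill=False)
    # f -> ('fancy', array([0, 2]), 2)
    return s, m, f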