laywerrobot/lib/python3.6/site-packages/pandas/core/internals.py

5903 lines
195 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
import warnings
import copy
from warnings import catch_warnings
import inspect
import itertools
import re
import operator
from datetime import datetime, timedelta, date
from collections import defaultdict
from functools import partial
import numpy as np
from pandas._libs import internals as libinternals
from pandas.core.base import PandasObject
from pandas.core.dtypes.dtypes import (
ExtensionDtype, DatetimeTZDtype,
PandasExtensionDtype,
CategoricalDtype)
from pandas.core.dtypes.common import (
_TD_DTYPE, _NS_DTYPE,
_ensure_int64, _ensure_platform_int,
is_integer,
is_dtype_equal,
is_timedelta64_dtype,
is_datetime64_dtype, is_datetimetz, is_sparse,
is_categorical, is_categorical_dtype,
is_integer_dtype,
is_datetime64tz_dtype,
is_bool_dtype,
is_object_dtype,
is_datetimelike_v_numeric,
is_float_dtype, is_numeric_dtype,
is_numeric_v_string_like, is_extension_type,
is_extension_array_dtype,
is_list_like,
is_re,
is_re_compilable,
is_scalar,
_get_dtype)
from pandas.core.dtypes.cast import (
maybe_downcast_to_dtype,
maybe_upcast,
maybe_promote,
infer_dtype_from,
infer_dtype_from_scalar,
soft_convert_objects,
maybe_convert_objects,
astype_nansafe,
find_common_type,
maybe_infer_dtype_type)
from pandas.core.dtypes.missing import (
isna, notna, array_equivalent,
_isna_compat,
is_null_datelike_scalar)
import pandas.core.dtypes.concat as _concat
from pandas.core.dtypes.generic import (
ABCSeries,
ABCDatetimeIndex,
ABCExtensionArray,
ABCIndexClass)
import pandas.core.common as com
import pandas.core.algorithms as algos
from pandas.core.index import Index, MultiIndex, _ensure_index
from pandas.core.indexing import maybe_convert_indices, check_setitem_lengths
from pandas.core.arrays import Categorical
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.timedeltas import TimedeltaIndex
from pandas.io.formats.printing import pprint_thing
import pandas.core.missing as missing
from pandas.core.sparse.array import _maybe_to_sparse, SparseArray
from pandas._libs import lib, tslib
from pandas._libs.tslib import Timedelta
from pandas._libs.internals import BlockPlacement
from pandas._libs.tslibs import conversion
from pandas.util._decorators import cache_readonly
from pandas.util._validators import validate_bool_kwarg
from pandas import compat
from pandas.compat import range, map, zip, u
class Block(PandasObject):
"""
Canonical n-dimensional unit of homogeneous dtype contained in a pandas
data structure
Index-ignorant; let the container take care of that
"""
__slots__ = ['_mgr_locs', 'values', 'ndim']
is_numeric = False
is_float = False
is_integer = False
is_complex = False
is_datetime = False
is_datetimetz = False
is_timedelta = False
is_bool = False
is_object = False
is_categorical = False
is_sparse = False
is_extension = False
_box_to_block_values = True
_can_hold_na = False
_can_consolidate = True
_verify_integrity = True
_validate_ndim = True
_ftype = 'dense'
_concatenator = staticmethod(np.concatenate)
def __init__(self, values, placement, ndim=None):
self.ndim = self._check_ndim(values, ndim)
self.mgr_locs = placement
self.values = values
if (self._validate_ndim and self.ndim and
len(self.mgr_locs) != len(self.values)):
raise ValueError(
'Wrong number of items passed {val}, placement implies '
'{mgr}'.format(val=len(self.values), mgr=len(self.mgr_locs)))
def _check_ndim(self, values, ndim):
"""ndim inference and validation.
Infers ndim from 'values' if not provided to __init__.
Validates that values.ndim and ndim are consistent if and only if
the class variable '_validate_ndim' is True.
Parameters
----------
values : array-like
ndim : int or None
Returns
-------
ndim : int
Raises
------
ValueError : the number of dimensions do not match
"""
if ndim is None:
ndim = values.ndim
if self._validate_ndim and values.ndim != ndim:
msg = ("Wrong number of dimensions. values.ndim != ndim "
"[{} != {}]")
raise ValueError(msg.format(values.ndim, ndim))
return ndim
@property
def _holder(self):
"""The array-like that can hold the underlying values.
None for 'Block', overridden by subclasses that don't
use an ndarray.
"""
return None
@property
def _consolidate_key(self):
return (self._can_consolidate, self.dtype.name)
@property
def _is_single_block(self):
return self.ndim == 1
@property
def is_view(self):
""" return a boolean if I am possibly a view """
return self.values.base is not None
@property
def is_datelike(self):
""" return True if I am a non-datelike """
return self.is_datetime or self.is_timedelta
def is_categorical_astype(self, dtype):
"""
validate that we have a astypeable to categorical,
returns a boolean if we are a categorical
"""
if dtype is Categorical or dtype is CategoricalDtype:
# this is a pd.Categorical, but is not
# a valid type for astypeing
raise TypeError("invalid type {0} for astype".format(dtype))
elif is_categorical_dtype(dtype):
return True
return False
def external_values(self, dtype=None):
""" return an outside world format, currently just the ndarray """
return self.values
def internal_values(self, dtype=None):
""" return an internal format, currently just the ndarray
this should be the pure internal API format
"""
return self.values
def formatting_values(self):
"""Return the internal values used by the DataFrame/SeriesFormatter"""
return self.internal_values()
def get_values(self, dtype=None):
"""
return an internal format, currently just the ndarray
this is often overridden to handle to_dense like operations
"""
if is_object_dtype(dtype):
return self.values.astype(object)
return self.values
def to_dense(self):
return self.values.view()
@property
def _na_value(self):
return np.nan
@property
def fill_value(self):
return np.nan
@property
def mgr_locs(self):
return self._mgr_locs
@mgr_locs.setter
def mgr_locs(self, new_mgr_locs):
if not isinstance(new_mgr_locs, BlockPlacement):
new_mgr_locs = BlockPlacement(new_mgr_locs)
self._mgr_locs = new_mgr_locs
@property
def array_dtype(self):
""" the dtype to return if I want to construct this block as an
array
"""
return self.dtype
def make_block(self, values, placement=None, ndim=None):
"""
Create a new block, with type inference propagate any values that are
not specified
"""
if placement is None:
placement = self.mgr_locs
if ndim is None:
ndim = self.ndim
return make_block(values, placement=placement, ndim=ndim)
def make_block_scalar(self, values):
"""
Create a ScalarBlock
"""
return ScalarBlock(values)
def make_block_same_class(self, values, placement=None, ndim=None,
dtype=None):
""" Wrap given values in a block of same type as self. """
if dtype is not None:
# issue 19431 fastparquet is passing this
warnings.warn("dtype argument is deprecated, will be removed "
"in a future release.", DeprecationWarning)
if placement is None:
placement = self.mgr_locs
return make_block(values, placement=placement, ndim=ndim,
klass=self.__class__, dtype=dtype)
def __unicode__(self):
# don't want to print out all of the items here
name = pprint_thing(self.__class__.__name__)
if self._is_single_block:
result = '{name}: {len} dtype: {dtype}'.format(
name=name, len=len(self), dtype=self.dtype)
else:
shape = ' x '.join(pprint_thing(s) for s in self.shape)
result = '{name}: {index}, {shape}, dtype: {dtype}'.format(
name=name, index=pprint_thing(self.mgr_locs.indexer),
shape=shape, dtype=self.dtype)
return result
def __len__(self):
return len(self.values)
def __getstate__(self):
return self.mgr_locs.indexer, self.values
def __setstate__(self, state):
self.mgr_locs = BlockPlacement(state[0])
self.values = state[1]
self.ndim = self.values.ndim
def _slice(self, slicer):
""" return a slice of my values """
return self.values[slicer]
def reshape_nd(self, labels, shape, ref_items, mgr=None):
"""
Parameters
----------
labels : list of new axis labels
shape : new shape
ref_items : new ref_items
return a new block that is transformed to a nd block
"""
return _block2d_to_blocknd(values=self.get_values().T,
placement=self.mgr_locs, shape=shape,
labels=labels, ref_items=ref_items)
def getitem_block(self, slicer, new_mgr_locs=None):
"""
Perform __getitem__-like, return result as block.
As of now, only supports slices that preserve dimensionality.
"""
if new_mgr_locs is None:
if isinstance(slicer, tuple):
axis0_slicer = slicer[0]
else:
axis0_slicer = slicer
new_mgr_locs = self.mgr_locs[axis0_slicer]
new_values = self._slice(slicer)
if self._validate_ndim and new_values.ndim != self.ndim:
raise ValueError("Only same dim slicing is allowed")
return self.make_block_same_class(new_values, new_mgr_locs)
@property
def shape(self):
return self.values.shape
@property
def dtype(self):
return self.values.dtype
@property
def ftype(self):
return "{dtype}:{ftype}".format(dtype=self.dtype, ftype=self._ftype)
def merge(self, other):
return _merge_blocks([self, other])
def concat_same_type(self, to_concat, placement=None):
"""
Concatenate list of single blocks of the same type.
"""
values = self._concatenator([blk.values for blk in to_concat],
axis=self.ndim - 1)
return self.make_block_same_class(
values, placement=placement or slice(0, len(values), 1))
def iget(self, i):
return self.values[i]
def set(self, locs, values, check=False):
"""
Modify Block in-place with new item value
Returns
-------
None
"""
self.values[locs] = values
def delete(self, loc):
"""
Delete given loc(-s) from block in-place.
"""
self.values = np.delete(self.values, loc, 0)
self.mgr_locs = self.mgr_locs.delete(loc)
def apply(self, func, mgr=None, **kwargs):
""" apply the function to my values; return a block if we are not
one
"""
with np.errstate(all='ignore'):
result = func(self.values, **kwargs)
if not isinstance(result, Block):
result = self.make_block(values=_block_shape(result,
ndim=self.ndim))
return result
def fillna(self, value, limit=None, inplace=False, downcast=None,
mgr=None):
""" fillna on the block with the value. If we fail, then convert to
ObjectBlock and try again
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
if not self._can_hold_na:
if inplace:
return self
else:
return self.copy()
mask = isna(self.values)
if limit is not None:
if not is_integer(limit):
raise ValueError('Limit must be an integer')
if limit < 1:
raise ValueError('Limit must be greater than 0')
if self.ndim > 2:
raise NotImplementedError("number of dimensions for 'fillna' "
"is currently limited to 2")
mask[mask.cumsum(self.ndim - 1) > limit] = False
# fillna, but if we cannot coerce, then try again as an ObjectBlock
try:
values, _, _, _ = self._try_coerce_args(self.values, value)
blocks = self.putmask(mask, value, inplace=inplace)
blocks = [b.make_block(values=self._try_coerce_result(b.values))
for b in blocks]
return self._maybe_downcast(blocks, downcast)
except (TypeError, ValueError):
# we can't process the value, but nothing to do
if not mask.any():
return self if inplace else self.copy()
# operate column-by-column
def f(m, v, i):
block = self.coerce_to_target_dtype(value)
# slice out our block
if i is not None:
block = block.getitem_block(slice(i, i + 1))
return block.fillna(value,
limit=limit,
inplace=inplace,
downcast=None)
return self.split_and_operate(mask, f, inplace)
def split_and_operate(self, mask, f, inplace):
"""
split the block per-column, and apply the callable f
per-column, return a new block for each. Handle
masking which will not change a block unless needed.
Parameters
----------
mask : 2-d boolean mask
f : callable accepting (1d-mask, 1d values, indexer)
inplace : boolean
Returns
-------
list of blocks
"""
if mask is None:
mask = np.ones(self.shape, dtype=bool)
new_values = self.values
def make_a_block(nv, ref_loc):
if isinstance(nv, Block):
block = nv
elif isinstance(nv, list):
block = nv[0]
else:
# Put back the dimension that was taken from it and make
# a block out of the result.
try:
nv = _block_shape(nv, ndim=self.ndim)
except (AttributeError, NotImplementedError):
pass
block = self.make_block(values=nv,
placement=ref_loc)
return block
# ndim == 1
if self.ndim == 1:
if mask.any():
nv = f(mask, new_values, None)
else:
nv = new_values if inplace else new_values.copy()
block = make_a_block(nv, self.mgr_locs)
return [block]
# ndim > 1
new_blocks = []
for i, ref_loc in enumerate(self.mgr_locs):
m = mask[i]
v = new_values[i]
# need a new block
if m.any():
nv = f(m, v, i)
else:
nv = v if inplace else v.copy()
block = make_a_block(nv, [ref_loc])
new_blocks.append(block)
return new_blocks
def _maybe_downcast(self, blocks, downcast=None):
# no need to downcast our float
# unless indicated
if downcast is None and self.is_float:
return blocks
elif downcast is None and (self.is_timedelta or self.is_datetime):
return blocks
if not isinstance(blocks, list):
blocks = [blocks]
return _extend_blocks([b.downcast(downcast) for b in blocks])
def downcast(self, dtypes=None, mgr=None):
""" try to downcast each item to the dict of dtypes if present """
# turn it off completely
if dtypes is False:
return self
values = self.values
# single block handling
if self._is_single_block:
# try to cast all non-floats here
if dtypes is None:
dtypes = 'infer'
nv = maybe_downcast_to_dtype(values, dtypes)
return self.make_block(nv)
# ndim > 1
if dtypes is None:
return self
if not (dtypes == 'infer' or isinstance(dtypes, dict)):
raise ValueError("downcast must have a dictionary or 'infer' as "
"its argument")
# operate column-by-column
# this is expensive as it splits the blocks items-by-item
def f(m, v, i):
if dtypes == 'infer':
dtype = 'infer'
else:
raise AssertionError("dtypes as dict is not supported yet")
if dtype is not None:
v = maybe_downcast_to_dtype(v, dtype)
return v
return self.split_and_operate(None, f, False)
def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
return self._astype(dtype, copy=copy, errors=errors, values=values,
**kwargs)
def _astype(self, dtype, copy=False, errors='raise', values=None,
klass=None, mgr=None, **kwargs):
"""Coerce to the new type
Parameters
----------
dtype : str, dtype convertible
copy : boolean, default False
copy if indicated
errors : str, {'raise', 'ignore'}, default 'ignore'
- ``raise`` : allow exceptions to be raised
- ``ignore`` : suppress exceptions. On error return original object
Returns
-------
Block
"""
errors_legal_values = ('raise', 'ignore')
if errors not in errors_legal_values:
invalid_arg = ("Expected value of kwarg 'errors' to be one of {}. "
"Supplied value is '{}'".format(
list(errors_legal_values), errors))
raise ValueError(invalid_arg)
if (inspect.isclass(dtype) and
issubclass(dtype, (PandasExtensionDtype, ExtensionDtype))):
msg = ("Expected an instance of {}, but got the class instead. "
"Try instantiating 'dtype'.".format(dtype.__name__))
raise TypeError(msg)
# may need to convert to categorical
if self.is_categorical_astype(dtype):
# deprecated 17636
if ('categories' in kwargs or 'ordered' in kwargs):
if isinstance(dtype, CategoricalDtype):
raise TypeError(
"Cannot specify a CategoricalDtype and also "
"`categories` or `ordered`. Use "
"`dtype=CategoricalDtype(categories, ordered)`"
" instead.")
warnings.warn("specifying 'categories' or 'ordered' in "
".astype() is deprecated; pass a "
"CategoricalDtype instead",
FutureWarning, stacklevel=7)
categories = kwargs.get('categories', None)
ordered = kwargs.get('ordered', None)
if com._any_not_none(categories, ordered):
dtype = CategoricalDtype(categories, ordered)
if is_categorical_dtype(self.values):
# GH 10696/18593: update an existing categorical efficiently
return self.make_block(self.values.astype(dtype, copy=copy))
return self.make_block(Categorical(self.values, dtype=dtype))
# astype processing
dtype = np.dtype(dtype)
if self.dtype == dtype:
if copy:
return self.copy()
return self
if klass is None:
if dtype == np.object_:
klass = ObjectBlock
try:
# force the copy here
if values is None:
if issubclass(dtype.type,
(compat.text_type, compat.string_types)):
# use native type formatting for datetime/tz/timedelta
if self.is_datelike:
values = self.to_native_types()
# astype formatting
else:
values = self.get_values()
else:
values = self.get_values(dtype=dtype)
# _astype_nansafe works fine with 1-d only
values = astype_nansafe(values.ravel(), dtype, copy=True)
values = values.reshape(self.shape)
newb = make_block(values, placement=self.mgr_locs,
klass=klass)
except:
if errors == 'raise':
raise
newb = self.copy() if copy else self
if newb.is_numeric and self.is_numeric:
if newb.shape != self.shape:
raise TypeError(
"cannot set astype for copy = [{copy}] for dtype "
"({dtype} [{itemsize}]) with smaller itemsize than "
"current ({newb_dtype} [{newb_size}])".format(
copy=copy, dtype=self.dtype.name,
itemsize=self.itemsize, newb_dtype=newb.dtype.name,
newb_size=newb.itemsize))
return newb
def convert(self, copy=True, **kwargs):
""" attempt to coerce any object types to better types return a copy
of the block (if copy = True) by definition we are not an ObjectBlock
here!
"""
return self.copy() if copy else self
def _can_hold_element(self, element):
""" require the same dtype as ourselves """
dtype = self.values.dtype.type
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return issubclass(tipo.type, dtype)
return isinstance(element, dtype)
def _try_cast_result(self, result, dtype=None):
""" try to cast the result to our original type, we may have
roundtripped thru object in the mean-time
"""
if dtype is None:
dtype = self.dtype
if self.is_integer or self.is_bool or self.is_datetime:
pass
elif self.is_float and result.dtype == self.dtype:
# protect against a bool/object showing up here
if isinstance(dtype, compat.string_types) and dtype == 'infer':
return result
if not isinstance(dtype, type):
dtype = dtype.type
if issubclass(dtype, (np.bool_, np.object_)):
if issubclass(dtype, np.bool_):
if isna(result).all():
return result.astype(np.bool_)
else:
result = result.astype(np.object_)
result[result == 1] = True
result[result == 0] = False
return result
else:
return result.astype(np.object_)
return result
# may need to change the dtype here
return maybe_downcast_to_dtype(result, dtype)
def _try_coerce_args(self, values, other):
""" provide coercion to our input arguments """
if np.any(notna(other)) and not self._can_hold_element(other):
# coercion issues
# let higher levels handle
raise TypeError("cannot convert {} to an {}".format(
type(other).__name__,
type(self).__name__.lower().replace('Block', '')))
return values, False, other, False
def _try_coerce_result(self, result):
""" reverse of try_coerce_args """
return result
def _try_coerce_and_cast_result(self, result, dtype=None):
result = self._try_coerce_result(result)
result = self._try_cast_result(result, dtype=dtype)
return result
def to_native_types(self, slicer=None, na_rep='nan', quoting=None,
**kwargs):
""" convert to our native types format, slicing if desired """
values = self.get_values()
if slicer is not None:
values = values[:, slicer]
mask = isna(values)
if not self.is_object and not quoting:
values = values.astype(str)
else:
values = np.array(values, dtype='object')
values[mask] = na_rep
return values
# block actions ####
def copy(self, deep=True, mgr=None):
""" copy constructor """
values = self.values
if deep:
values = values.copy()
return self.make_block_same_class(values)
def replace(self, to_replace, value, inplace=False, filter=None,
regex=False, convert=True, mgr=None):
""" replace the to_replace value with value, possible to create new
blocks here this is just a call to putmask. regex is not used here.
It is used in ObjectBlocks. It is here for API
compatibility.
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
original_to_replace = to_replace
# try to replace, if we raise an error, convert to ObjectBlock and
# retry
try:
values, _, to_replace, _ = self._try_coerce_args(self.values,
to_replace)
mask = missing.mask_missing(values, to_replace)
if filter is not None:
filtered_out = ~self.mgr_locs.isin(filter)
mask[filtered_out.nonzero()[0]] = False
blocks = self.putmask(mask, value, inplace=inplace)
if convert:
blocks = [b.convert(by_item=True, numeric=False,
copy=not inplace) for b in blocks]
return blocks
except (TypeError, ValueError):
# try again with a compatible block
block = self.astype(object)
return block.replace(
to_replace=original_to_replace, value=value, inplace=inplace,
filter=filter, regex=regex, convert=convert)
def _replace_single(self, *args, **kwargs):
""" no-op on a non-ObjectBlock """
return self if kwargs['inplace'] else self.copy()
def setitem(self, indexer, value, mgr=None):
"""Set the value inplace, returning a a maybe different typed block.
Parameters
----------
indexer : tuple, list-like, array-like, slice
The subset of self.values to set
value : object
The value being set
mgr : BlockPlacement, optional
Returns
-------
Block
Notes
-----
`indexer` is a direct slice/positional indexer. `value` must
be a compatible shape.
"""
# coerce None values, if appropriate
if value is None:
if self.is_numeric:
value = np.nan
# coerce if block dtype can store value
values = self.values
try:
values, _, value, _ = self._try_coerce_args(values, value)
# can keep its own dtype
if hasattr(value, 'dtype') and is_dtype_equal(values.dtype,
value.dtype):
dtype = self.dtype
else:
dtype = 'infer'
except (TypeError, ValueError):
# current dtype cannot store value, coerce to common dtype
find_dtype = False
if hasattr(value, 'dtype'):
dtype = value.dtype
find_dtype = True
elif is_scalar(value):
if isna(value):
# NaN promotion is handled in latter path
dtype = False
else:
dtype, _ = infer_dtype_from_scalar(value,
pandas_dtype=True)
find_dtype = True
else:
dtype = 'infer'
if find_dtype:
dtype = find_common_type([values.dtype, dtype])
if not is_dtype_equal(self.dtype, dtype):
b = self.astype(dtype)
return b.setitem(indexer, value, mgr=mgr)
# value must be storeable at this moment
arr_value = np.array(value)
# cast the values to a type that can hold nan (if necessary)
if not self._can_hold_element(value):
dtype, _ = maybe_promote(arr_value.dtype)
values = values.astype(dtype)
transf = (lambda x: x.T) if self.ndim == 2 else (lambda x: x)
values = transf(values)
# length checking
check_setitem_lengths(indexer, value, values)
def _is_scalar_indexer(indexer):
# return True if we are all scalar indexers
if arr_value.ndim == 1:
if not isinstance(indexer, tuple):
indexer = tuple([indexer])
return any(isinstance(idx, np.ndarray) and len(idx) == 0
for idx in indexer)
return False
def _is_empty_indexer(indexer):
# return a boolean if we have an empty indexer
if is_list_like(indexer) and not len(indexer):
return True
if arr_value.ndim == 1:
if not isinstance(indexer, tuple):
indexer = tuple([indexer])
return any(isinstance(idx, np.ndarray) and len(idx) == 0
for idx in indexer)
return False
# empty indexers
# 8669 (empty)
if _is_empty_indexer(indexer):
pass
# setting a single element for each dim and with a rhs that could
# be say a list
# GH 6043
elif _is_scalar_indexer(indexer):
values[indexer] = value
# if we are an exact match (ex-broadcasting),
# then use the resultant dtype
elif (len(arr_value.shape) and
arr_value.shape[0] == values.shape[0] and
np.prod(arr_value.shape) == np.prod(values.shape)):
values[indexer] = value
try:
values = values.astype(arr_value.dtype)
except ValueError:
pass
# set
else:
values[indexer] = value
# coerce and try to infer the dtypes of the result
values = self._try_coerce_and_cast_result(values, dtype)
block = self.make_block(transf(values))
return block
def putmask(self, mask, new, align=True, inplace=False, axis=0,
transpose=False, mgr=None):
""" putmask the data to the block; it is possible that we may create a
new dtype of block
return the resulting block(s)
Parameters
----------
mask : the condition to respect
new : a ndarray/object
align : boolean, perform alignment on other/cond, default is True
inplace : perform inplace modification, default is False
axis : int
transpose : boolean
Set to True if self is stored with axes reversed
Returns
-------
a list of new blocks, the result of the putmask
"""
new_values = self.values if inplace else self.values.copy()
new = getattr(new, 'values', new)
mask = getattr(mask, 'values', mask)
# if we are passed a scalar None, convert it here
if not is_list_like(new) and isna(new) and not self.is_object:
new = self.fill_value
if self._can_hold_element(new):
_, _, new, _ = self._try_coerce_args(new_values, new)
if transpose:
new_values = new_values.T
# If the default repeat behavior in np.putmask would go in the
# wrong direction, then explicitly repeat and reshape new instead
if getattr(new, 'ndim', 0) >= 1:
if self.ndim - 1 == new.ndim and axis == 1:
new = np.repeat(
new, new_values.shape[-1]).reshape(self.shape)
new = new.astype(new_values.dtype)
# we require exact matches between the len of the
# values we are setting (or is compat). np.putmask
# doesn't check this and will simply truncate / pad
# the output, but we want sane error messages
#
# TODO: this prob needs some better checking
# for 2D cases
if ((is_list_like(new) and
np.any(mask[mask]) and
getattr(new, 'ndim', 1) == 1)):
if not (mask.shape[-1] == len(new) or
mask[mask].shape[-1] == len(new) or
len(new) == 1):
raise ValueError("cannot assign mismatch "
"length to masked array")
np.putmask(new_values, mask, new)
# maybe upcast me
elif mask.any():
if transpose:
mask = mask.T
if isinstance(new, np.ndarray):
new = new.T
axis = new_values.ndim - axis - 1
# Pseudo-broadcast
if getattr(new, 'ndim', 0) >= 1:
if self.ndim - 1 == new.ndim:
new_shape = list(new.shape)
new_shape.insert(axis, 1)
new = new.reshape(tuple(new_shape))
# operate column-by-column
def f(m, v, i):
if i is None:
# ndim==1 case.
n = new
else:
if isinstance(new, np.ndarray):
n = np.squeeze(new[i % new.shape[0]])
else:
n = np.array(new)
# type of the new block
dtype, _ = maybe_promote(n.dtype)
# we need to explicitly astype here to make a copy
n = n.astype(dtype)
nv = _putmask_smart(v, m, n)
return nv
new_blocks = self.split_and_operate(mask, f, inplace)
return new_blocks
if inplace:
return [self]
if transpose:
new_values = new_values.T
return [self.make_block(new_values)]
def coerce_to_target_dtype(self, other):
"""
coerce the current block to a dtype compat for other
we will return a block, possibly object, and not raise
we can also safely try to coerce to the same dtype
and will receive the same block
"""
# if we cannot then coerce to object
dtype, _ = infer_dtype_from(other, pandas_dtype=True)
if is_dtype_equal(self.dtype, dtype):
return self
if self.is_bool or is_object_dtype(dtype) or is_bool_dtype(dtype):
# we don't upcast to bool
return self.astype(object)
elif ((self.is_float or self.is_complex) and
(is_integer_dtype(dtype) or is_float_dtype(dtype))):
# don't coerce float/complex to int
return self
elif (self.is_datetime or
is_datetime64_dtype(dtype) or
is_datetime64tz_dtype(dtype)):
# not a datetime
if not ((is_datetime64_dtype(dtype) or
is_datetime64tz_dtype(dtype)) and self.is_datetime):
return self.astype(object)
# don't upcast timezone with different timezone or no timezone
mytz = getattr(self.dtype, 'tz', None)
othertz = getattr(dtype, 'tz', None)
if str(mytz) != str(othertz):
return self.astype(object)
raise AssertionError("possible recursion in "
"coerce_to_target_dtype: {} {}".format(
self, other))
elif (self.is_timedelta or is_timedelta64_dtype(dtype)):
# not a timedelta
if not (is_timedelta64_dtype(dtype) and self.is_timedelta):
return self.astype(object)
raise AssertionError("possible recursion in "
"coerce_to_target_dtype: {} {}".format(
self, other))
try:
return self.astype(dtype)
except (ValueError, TypeError):
pass
return self.astype(object)
def interpolate(self, method='pad', axis=0, index=None, values=None,
inplace=False, limit=None, limit_direction='forward',
limit_area=None, fill_value=None, coerce=False,
downcast=None, mgr=None, **kwargs):
inplace = validate_bool_kwarg(inplace, 'inplace')
def check_int_bool(self, inplace):
# Only FloatBlocks will contain NaNs.
# timedelta subclasses IntBlock
if (self.is_bool or self.is_integer) and not self.is_timedelta:
if inplace:
return self
else:
return self.copy()
# a fill na type method
try:
m = missing.clean_fill_method(method)
except:
m = None
if m is not None:
r = check_int_bool(self, inplace)
if r is not None:
return r
return self._interpolate_with_fill(method=m, axis=axis,
inplace=inplace, limit=limit,
fill_value=fill_value,
coerce=coerce,
downcast=downcast, mgr=mgr)
# try an interp method
try:
m = missing.clean_interp_method(method, **kwargs)
except:
m = None
if m is not None:
r = check_int_bool(self, inplace)
if r is not None:
return r
return self._interpolate(method=m, index=index, values=values,
axis=axis, limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
fill_value=fill_value, inplace=inplace,
downcast=downcast, mgr=mgr, **kwargs)
raise ValueError("invalid method '{0}' to interpolate.".format(method))
def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
limit=None, fill_value=None, coerce=False,
downcast=None, mgr=None):
""" fillna but using the interpolate machinery """
inplace = validate_bool_kwarg(inplace, 'inplace')
# if we are coercing, then don't force the conversion
# if the block can't hold the type
if coerce:
if not self._can_hold_na:
if inplace:
return [self]
else:
return [self.copy()]
values = self.values if inplace else self.values.copy()
values, _, fill_value, _ = self._try_coerce_args(values, fill_value)
values = missing.interpolate_2d(values, method=method, axis=axis,
limit=limit, fill_value=fill_value,
dtype=self.dtype)
values = self._try_coerce_result(values)
blocks = [self.make_block_same_class(values, ndim=self.ndim)]
return self._maybe_downcast(blocks, downcast)
def _interpolate(self, method=None, index=None, values=None,
fill_value=None, axis=0, limit=None,
limit_direction='forward', limit_area=None,
inplace=False, downcast=None, mgr=None, **kwargs):
""" interpolate using scipy wrappers """
inplace = validate_bool_kwarg(inplace, 'inplace')
data = self.values if inplace else self.values.copy()
# only deal with floats
if not self.is_float:
if not self.is_integer:
return self
data = data.astype(np.float64)
if fill_value is None:
fill_value = self.fill_value
if method in ('krogh', 'piecewise_polynomial', 'pchip'):
if not index.is_monotonic:
raise ValueError("{0} interpolation requires that the "
"index be monotonic.".format(method))
# process 1-d slices in the axis direction
def func(x):
# process a 1-d slice, returning it
# should the axis argument be handled below in apply_along_axis?
# i.e. not an arg to missing.interpolate_1d
return missing.interpolate_1d(index, x, method=method, limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
fill_value=fill_value,
bounds_error=False, **kwargs)
# interp each column independently
interp_values = np.apply_along_axis(func, axis, data)
blocks = [self.make_block_same_class(interp_values)]
return self._maybe_downcast(blocks, downcast)
def take_nd(self, indexer, axis, new_mgr_locs=None, fill_tuple=None):
"""
Take values according to indexer and return them as a block.bb
"""
# algos.take_nd dispatches for DatetimeTZBlock, CategoricalBlock
# so need to preserve types
# sparse is treated like an ndarray, but needs .get_values() shaping
values = self.values
if self.is_sparse:
values = self.get_values()
if fill_tuple is None:
fill_value = self.fill_value
new_values = algos.take_nd(values, indexer, axis=axis,
allow_fill=False)
else:
fill_value = fill_tuple[0]
new_values = algos.take_nd(values, indexer, axis=axis,
allow_fill=True, fill_value=fill_value)
if new_mgr_locs is None:
if axis == 0:
slc = libinternals.indexer_as_slice(indexer)
if slc is not None:
new_mgr_locs = self.mgr_locs[slc]
else:
new_mgr_locs = self.mgr_locs[indexer]
else:
new_mgr_locs = self.mgr_locs
if not is_dtype_equal(new_values.dtype, self.dtype):
return self.make_block(new_values, new_mgr_locs)
else:
return self.make_block_same_class(new_values, new_mgr_locs)
def diff(self, n, axis=1, mgr=None):
""" return block for the diff of the values """
new_values = algos.diff(self.values, n, axis=axis)
return [self.make_block(values=new_values)]
def shift(self, periods, axis=0, mgr=None):
""" shift the block by periods, possibly upcast """
# convert integer to float if necessary. need to do a lot more than
# that, handle boolean etc also
new_values, fill_value = maybe_upcast(self.values)
# make sure array sent to np.roll is c_contiguous
f_ordered = new_values.flags.f_contiguous
if f_ordered:
new_values = new_values.T
axis = new_values.ndim - axis - 1
if np.prod(new_values.shape):
new_values = np.roll(new_values, _ensure_platform_int(periods),
axis=axis)
axis_indexer = [slice(None)] * self.ndim
if periods > 0:
axis_indexer[axis] = slice(None, periods)
else:
axis_indexer[axis] = slice(periods, None)
new_values[tuple(axis_indexer)] = fill_value
# restore original order
if f_ordered:
new_values = new_values.T
return [self.make_block(new_values)]
def eval(self, func, other, errors='raise', try_cast=False, mgr=None):
"""
evaluate the block; return result block from the result
Parameters
----------
func : how to combine self, other
other : a ndarray/object
errors : str, {'raise', 'ignore'}, default 'raise'
- ``raise`` : allow exceptions to be raised
- ``ignore`` : suppress exceptions. On error return original object
try_cast : try casting the results to the input type
Returns
-------
a new block, the result of the func
"""
orig_other = other
values = self.values
other = getattr(other, 'values', other)
# make sure that we can broadcast
is_transposed = False
if hasattr(other, 'ndim') and hasattr(values, 'ndim'):
if values.ndim != other.ndim:
is_transposed = True
else:
if values.shape == other.shape[::-1]:
is_transposed = True
elif values.shape[0] == other.shape[-1]:
is_transposed = True
else:
# this is a broadcast error heree
raise ValueError(
"cannot broadcast shape [{t_shape}] with "
"block values [{oth_shape}]".format(
t_shape=values.T.shape, oth_shape=other.shape))
transf = (lambda x: x.T) if is_transposed else (lambda x: x)
# coerce/transpose the args if needed
try:
values, values_mask, other, other_mask = self._try_coerce_args(
transf(values), other)
except TypeError:
block = self.coerce_to_target_dtype(orig_other)
return block.eval(func, orig_other,
errors=errors,
try_cast=try_cast, mgr=mgr)
# get the result, may need to transpose the other
def get_result(other):
# avoid numpy warning of comparisons again None
if other is None:
result = not func.__name__ == 'eq'
# avoid numpy warning of elementwise comparisons to object
elif is_numeric_v_string_like(values, other):
result = False
# avoid numpy warning of elementwise comparisons
elif func.__name__ == 'eq':
if is_list_like(other) and not isinstance(other, np.ndarray):
other = np.asarray(other)
# if we can broadcast, then ok
if values.shape[-1] != other.shape[-1]:
return False
result = func(values, other)
else:
result = func(values, other)
# mask if needed
if isinstance(values_mask, np.ndarray) and values_mask.any():
result = result.astype('float64', copy=False)
result[values_mask] = np.nan
if other_mask is True:
result = result.astype('float64', copy=False)
result[:] = np.nan
elif isinstance(other_mask, np.ndarray) and other_mask.any():
result = result.astype('float64', copy=False)
result[other_mask.ravel()] = np.nan
return result
# error handler if we have an issue operating with the function
def handle_error():
if errors == 'raise':
# The 'detail' variable is defined in outer scope.
raise TypeError(
'Could not operate {other!r} with block values '
'{detail!s}'.format(other=other, detail=detail)) # noqa
else:
# return the values
result = np.empty(values.shape, dtype='O')
result.fill(np.nan)
return result
# get the result
try:
with np.errstate(all='ignore'):
result = get_result(other)
# if we have an invalid shape/broadcast error
# GH4576, so raise instead of allowing to pass through
except ValueError as detail:
raise
except Exception as detail:
result = handle_error()
# technically a broadcast error in numpy can 'work' by returning a
# boolean False
if not isinstance(result, np.ndarray):
if not isinstance(result, np.ndarray):
# differentiate between an invalid ndarray-ndarray comparison
# and an invalid type comparison
if isinstance(values, np.ndarray) and is_list_like(other):
raise ValueError(
'Invalid broadcasting comparison [{other!r}] with '
'block values'.format(other=other))
raise TypeError('Could not compare [{other!r}] '
'with block values'.format(other=other))
# transpose if needed
result = transf(result)
# try to cast if requested
if try_cast:
result = self._try_cast_result(result)
result = _block_shape(result, ndim=self.ndim)
return [self.make_block(result)]
def where(self, other, cond, align=True, errors='raise',
try_cast=False, axis=0, transpose=False, mgr=None):
"""
evaluate the block; return result block(s) from the result
Parameters
----------
other : a ndarray/object
cond : the condition to respect
align : boolean, perform alignment on other/cond
errors : str, {'raise', 'ignore'}, default 'raise'
- ``raise`` : allow exceptions to be raised
- ``ignore`` : suppress exceptions. On error return original object
axis : int
transpose : boolean
Set to True if self is stored with axes reversed
Returns
-------
a new block(s), the result of the func
"""
import pandas.core.computation.expressions as expressions
assert errors in ['raise', 'ignore']
values = self.values
orig_other = other
if transpose:
values = values.T
other = getattr(other, 'values', other)
cond = getattr(cond, 'values', cond)
# If the default broadcasting would go in the wrong direction, then
# explicitly reshape other instead
if getattr(other, 'ndim', 0) >= 1:
if values.ndim - 1 == other.ndim and axis == 1:
other = other.reshape(tuple(other.shape + (1, )))
if not hasattr(cond, 'shape'):
raise ValueError("where must have a condition that is ndarray "
"like")
# our where function
def func(cond, values, other):
if cond.ravel().all():
return values
values, values_mask, other, other_mask = self._try_coerce_args(
values, other)
try:
return self._try_coerce_result(expressions.where(
cond, values, other))
except Exception as detail:
if errors == 'raise':
raise TypeError(
'Could not operate [{other!r}] with block values '
'[{detail!s}]'.format(other=other, detail=detail))
else:
# return the values
result = np.empty(values.shape, dtype='float64')
result.fill(np.nan)
return result
# see if we can operate on the entire block, or need item-by-item
# or if we are a single block (ndim == 1)
try:
result = func(cond, values, other)
except TypeError:
# we cannot coerce, return a compat dtype
# we are explicitly ignoring errors
block = self.coerce_to_target_dtype(other)
blocks = block.where(orig_other, cond, align=align,
errors=errors,
try_cast=try_cast, axis=axis,
transpose=transpose)
return self._maybe_downcast(blocks, 'infer')
if self._can_hold_na or self.ndim == 1:
if transpose:
result = result.T
# try to cast if requested
if try_cast:
result = self._try_cast_result(result)
return self.make_block(result)
# might need to separate out blocks
axis = cond.ndim - 1
cond = cond.swapaxes(axis, 0)
mask = np.array([cond[i].all() for i in range(cond.shape[0])],
dtype=bool)
result_blocks = []
for m in [mask, ~mask]:
if m.any():
r = self._try_cast_result(result.take(m.nonzero()[0],
axis=axis))
result_blocks.append(
self.make_block(r.T, placement=self.mgr_locs[m]))
return result_blocks
def equals(self, other):
if self.dtype != other.dtype or self.shape != other.shape:
return False
return array_equivalent(self.values, other.values)
def _unstack(self, unstacker_func, new_columns):
"""Return a list of unstacked blocks of self
Parameters
----------
unstacker_func : callable
Partially applied unstacker.
new_columns : Index
All columns of the unstacked BlockManager.
Returns
-------
blocks : list of Block
New blocks of unstacked values.
mask : array_like of bool
The mask of columns of `blocks` we should keep.
"""
unstacker = unstacker_func(self.values.T)
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()
mask = mask.any(0)
new_values = new_values.T[mask]
new_placement = new_placement[mask]
blocks = [make_block(new_values, placement=new_placement)]
return blocks, mask
def quantile(self, qs, interpolation='linear', axis=0, mgr=None):
"""
compute the quantiles of the
Parameters
----------
qs: a scalar or list of the quantiles to be computed
interpolation: type of interpolation, default 'linear'
axis: axis to compute, default 0
Returns
-------
tuple of (axis, block)
"""
kw = {'interpolation': interpolation}
values = self.get_values()
values, _, _, _ = self._try_coerce_args(values, values)
def _nanpercentile1D(values, mask, q, **kw):
values = values[~mask]
if len(values) == 0:
if is_scalar(q):
return self._na_value
else:
return np.array([self._na_value] * len(q),
dtype=values.dtype)
return np.percentile(values, q, **kw)
def _nanpercentile(values, q, axis, **kw):
mask = isna(self.values)
if not is_scalar(mask) and mask.any():
if self.ndim == 1:
return _nanpercentile1D(values, mask, q, **kw)
else:
# for nonconsolidatable blocks mask is 1D, but values 2D
if mask.ndim < values.ndim:
mask = mask.reshape(values.shape)
if axis == 0:
values = values.T
mask = mask.T
result = [_nanpercentile1D(val, m, q, **kw) for (val, m)
in zip(list(values), list(mask))]
result = np.array(result, dtype=values.dtype, copy=False).T
return result
else:
return np.percentile(values, q, axis=axis, **kw)
from pandas import Float64Index
is_empty = values.shape[axis] == 0
if is_list_like(qs):
ax = Float64Index(qs)
if is_empty:
if self.ndim == 1:
result = self._na_value
else:
# create the array of na_values
# 2d len(values) * len(qs)
result = np.repeat(np.array([self._na_value] * len(qs)),
len(values)).reshape(len(values),
len(qs))
else:
try:
result = _nanpercentile(values, np.array(qs) * 100,
axis=axis, **kw)
except ValueError:
# older numpies don't handle an array for q
result = [_nanpercentile(values, q * 100,
axis=axis, **kw) for q in qs]
result = np.array(result, copy=False)
if self.ndim > 1:
result = result.T
else:
if self.ndim == 1:
ax = Float64Index([qs])
else:
ax = mgr.axes[0]
if is_empty:
if self.ndim == 1:
result = self._na_value
else:
result = np.array([self._na_value] * len(self))
else:
result = _nanpercentile(values, qs * 100, axis=axis, **kw)
ndim = getattr(result, 'ndim', None) or 0
result = self._try_coerce_result(result)
if is_scalar(result):
return ax, self.make_block_scalar(result)
return ax, make_block(result,
placement=np.arange(len(result)),
ndim=ndim)
class ScalarBlock(Block):
"""
a scalar compat Block
"""
__slots__ = ['_mgr_locs', 'values', 'ndim']
def __init__(self, values):
self.ndim = 0
self.mgr_locs = [0]
self.values = values
@property
def dtype(self):
return type(self.values)
@property
def shape(self):
return tuple([0])
def __len__(self):
return 0
class NonConsolidatableMixIn(object):
""" hold methods for the nonconsolidatable blocks """
_can_consolidate = False
_verify_integrity = False
_validate_ndim = False
def __init__(self, values, placement, ndim=None):
"""Initialize a non-consolidatable block.
'ndim' may be inferred from 'placement'.
This will call continue to call __init__ for the other base
classes mixed in with this Mixin.
"""
# Placement must be converted to BlockPlacement so that we can check
# its length
if not isinstance(placement, BlockPlacement):
placement = BlockPlacement(placement)
# Maybe infer ndim from placement
if ndim is None:
if len(placement) != 1:
ndim = 1
else:
ndim = 2
super(NonConsolidatableMixIn, self).__init__(values, placement,
ndim=ndim)
@property
def shape(self):
if self.ndim == 1:
return (len(self.values)),
return (len(self.mgr_locs), len(self.values))
def get_values(self, dtype=None):
""" need to to_dense myself (and always return a ndim sized object) """
values = self.values.to_dense()
if values.ndim == self.ndim - 1:
values = values.reshape((1,) + values.shape)
return values
def iget(self, col):
if self.ndim == 2 and isinstance(col, tuple):
col, loc = col
if not com.is_null_slice(col) and col != 0:
raise IndexError("{0} only contains one item".format(self))
return self.values[loc]
else:
if col != 0:
raise IndexError("{0} only contains one item".format(self))
return self.values
def should_store(self, value):
return isinstance(value, self._holder)
def set(self, locs, values, check=False):
assert locs.tolist() == [0]
self.values = values
def putmask(self, mask, new, align=True, inplace=False, axis=0,
transpose=False, mgr=None):
"""
putmask the data to the block; we must be a single block and not
generate other blocks
return the resulting block
Parameters
----------
mask : the condition to respect
new : a ndarray/object
align : boolean, perform alignment on other/cond, default is True
inplace : perform inplace modification, default is False
Returns
-------
a new block, the result of the putmask
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
# use block's copy logic.
# .values may be an Index which does shallow copy by default
new_values = self.values if inplace else self.copy().values
new_values, _, new, _ = self._try_coerce_args(new_values, new)
if isinstance(new, np.ndarray) and len(new) == len(mask):
new = new[mask]
mask = _safe_reshape(mask, new_values.shape)
new_values[mask] = new
new_values = self._try_coerce_result(new_values)
return [self.make_block(values=new_values)]
def _slice(self, slicer):
""" return a slice of my values (but densify first) """
return self.get_values()[slicer]
def _try_cast_result(self, result, dtype=None):
return result
def _unstack(self, unstacker_func, new_columns):
"""Return a list of unstacked blocks of self
Parameters
----------
unstacker_func : callable
Partially applied unstacker.
new_columns : Index
All columns of the unstacked BlockManager.
Returns
-------
blocks : list of Block
New blocks of unstacked values.
mask : array_like of bool
The mask of columns of `blocks` we should keep.
"""
# NonConsolidatable blocks can have a single item only, so we return
# one block per item
unstacker = unstacker_func(self.values.T)
new_items = unstacker.get_new_columns()
new_placement = new_columns.get_indexer(new_items)
new_values, mask = unstacker.get_new_values()
mask = mask.any(0)
new_values = new_values.T[mask]
new_placement = new_placement[mask]
blocks = [self.make_block_same_class(vals, [place])
for vals, place in zip(new_values, new_placement)]
return blocks, mask
class ExtensionBlock(NonConsolidatableMixIn, Block):
"""Block for holding extension types.
Notes
-----
This holds all 3rd-party extension array types. It's also the immediate
parent class for our internal extension types' blocks, CategoricalBlock.
ExtensionArrays are limited to 1-D.
"""
is_extension = True
def __init__(self, values, placement, ndim=None):
values = self._maybe_coerce_values(values)
super(ExtensionBlock, self).__init__(values, placement, ndim)
def _maybe_coerce_values(self, values):
"""Unbox to an extension array.
This will unbox an ExtensionArray stored in an Index or Series.
ExtensionArrays pass through. No dtype coercion is done.
Parameters
----------
values : Index, Series, ExtensionArray
Returns
-------
ExtensionArray
"""
if isinstance(values, (ABCIndexClass, ABCSeries)):
values = values._values
return values
@property
def _holder(self):
# For extension blocks, the holder is values-dependent.
return type(self.values)
@property
def fill_value(self):
# Used in reindex_indexer
return self.values.dtype.na_value
@property
def _can_hold_na(self):
# The default ExtensionArray._can_hold_na is True
return self._holder._can_hold_na
@property
def is_view(self):
"""Extension arrays are never treated as views."""
return False
def setitem(self, indexer, value, mgr=None):
"""Set the value inplace, returning a same-typed block.
This differs from Block.setitem by not allowing setitem to change
the dtype of the Block.
Parameters
----------
indexer : tuple, list-like, array-like, slice
The subset of self.values to set
value : object
The value being set
mgr : BlockPlacement, optional
Returns
-------
Block
Notes
-----
`indexer` is a direct slice/positional indexer. `value` must
be a compatible shape.
"""
if isinstance(indexer, tuple):
# we are always 1-D
indexer = indexer[0]
check_setitem_lengths(indexer, value, self.values)
self.values[indexer] = value
return self
def get_values(self, dtype=None):
# ExtensionArrays must be iterable, so this works.
values = np.asarray(self.values)
if values.ndim == self.ndim - 1:
values = values.reshape((1,) + values.shape)
return values
def to_dense(self):
return np.asarray(self.values)
def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):
"""
Take values according to indexer and return them as a block.
"""
if fill_tuple is None:
fill_value = None
else:
fill_value = fill_tuple[0]
# axis doesn't matter; we are really a single-dim object
# but are passed the axis depending on the calling routing
# if its REALLY axis 0, then this will be a reindex and not a take
new_values = self.values.take(indexer, fill_value=fill_value,
allow_fill=True)
# if we are a 1-dim object, then always place at 0
if self.ndim == 1:
new_mgr_locs = [0]
else:
if new_mgr_locs is None:
new_mgr_locs = self.mgr_locs
return self.make_block_same_class(new_values, new_mgr_locs)
def _can_hold_element(self, element):
# XXX: We may need to think about pushing this onto the array.
# We're doing the same as CategoricalBlock here.
return True
def _slice(self, slicer):
""" return a slice of my values """
# slice the category
# return same dims as we currently have
if isinstance(slicer, tuple) and len(slicer) == 2:
if not com.is_null_slice(slicer[0]):
raise AssertionError("invalid slicing for a 1-ndim "
"categorical")
slicer = slicer[1]
return self.values[slicer]
def formatting_values(self):
return self.values._formatting_values()
def concat_same_type(self, to_concat, placement=None):
"""
Concatenate list of single blocks of the same type.
"""
values = self._holder._concat_same_type(
[blk.values for blk in to_concat])
placement = placement or slice(0, len(values), 1)
return self.make_block_same_class(values, ndim=self.ndim,
placement=placement)
def fillna(self, value, limit=None, inplace=False, downcast=None,
mgr=None):
values = self.values if inplace else self.values.copy()
values = values.fillna(value=value, limit=limit)
return [self.make_block_same_class(values=values,
placement=self.mgr_locs,
ndim=self.ndim)]
def interpolate(self, method='pad', axis=0, inplace=False, limit=None,
fill_value=None, **kwargs):
values = self.values if inplace else self.values.copy()
return self.make_block_same_class(
values=values.fillna(value=fill_value, method=method,
limit=limit),
placement=self.mgr_locs)
class NumericBlock(Block):
__slots__ = ()
is_numeric = True
_can_hold_na = True
class FloatOrComplexBlock(NumericBlock):
__slots__ = ()
def equals(self, other):
if self.dtype != other.dtype or self.shape != other.shape:
return False
left, right = self.values, other.values
return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
class FloatBlock(FloatOrComplexBlock):
__slots__ = ()
is_float = True
def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return (issubclass(tipo.type, (np.floating, np.integer)) and
not issubclass(tipo.type, (np.datetime64, np.timedelta64)))
return (
isinstance(
element, (float, int, np.floating, np.int_, compat.long))
and not isinstance(element, (bool, np.bool_, datetime, timedelta,
np.datetime64, np.timedelta64)))
def to_native_types(self, slicer=None, na_rep='', float_format=None,
decimal='.', quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
if slicer is not None:
values = values[:, slicer]
# see gh-13418: no special formatting is desired at the
# output (important for appropriate 'quoting' behaviour),
# so do not pass it through the FloatArrayFormatter
if float_format is None and decimal == '.':
mask = isna(values)
if not quoting:
values = values.astype(str)
else:
values = np.array(values, dtype='object')
values[mask] = na_rep
return values
from pandas.io.formats.format import FloatArrayFormatter
formatter = FloatArrayFormatter(values, na_rep=na_rep,
float_format=float_format,
decimal=decimal, quoting=quoting,
fixed_width=False)
return formatter.get_result_as_array()
def should_store(self, value):
# when inserting a column should not coerce integers to floats
# unnecessarily
return (issubclass(value.dtype.type, np.floating) and
value.dtype == self.dtype)
class ComplexBlock(FloatOrComplexBlock):
__slots__ = ()
is_complex = True
def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return issubclass(tipo.type,
(np.floating, np.integer, np.complexfloating))
return (
isinstance(
element,
(float, int, complex, np.float_, np.int_, compat.long))
and not isinstance(element, (bool, np.bool_)))
def should_store(self, value):
return issubclass(value.dtype.type, np.complexfloating)
class IntBlock(NumericBlock):
__slots__ = ()
is_integer = True
_can_hold_na = False
def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return (issubclass(tipo.type, np.integer) and
not issubclass(tipo.type, (np.datetime64,
np.timedelta64)) and
self.dtype.itemsize >= tipo.itemsize)
return is_integer(element)
def should_store(self, value):
return is_integer_dtype(value) and value.dtype == self.dtype
class DatetimeLikeBlockMixin(object):
"""Mixin class for DatetimeBlock and DatetimeTZBlock."""
@property
def _holder(self):
return DatetimeIndex
@property
def _na_value(self):
return tslib.NaT
@property
def fill_value(self):
return tslib.iNaT
def get_values(self, dtype=None):
"""
return object dtype as boxed values, such as Timestamps/Timedelta
"""
if is_object_dtype(dtype):
return lib.map_infer(self.values.ravel(),
self._box_func).reshape(self.values.shape)
return self.values
class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock):
__slots__ = ()
is_timedelta = True
_can_hold_na = True
is_numeric = False
def __init__(self, values, placement, ndim=None):
if values.dtype != _TD_DTYPE:
values = conversion.ensure_timedelta64ns(values)
super(TimeDeltaBlock, self).__init__(values,
placement=placement, ndim=ndim)
@property
def _holder(self):
return TimedeltaIndex
@property
def _box_func(self):
return lambda x: tslib.Timedelta(x, unit='ns')
def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return issubclass(tipo.type, np.timedelta64)
return is_integer(element) or isinstance(
element, (timedelta, np.timedelta64))
def fillna(self, value, **kwargs):
# allow filling with integers to be
# interpreted as seconds
if is_integer(value) and not isinstance(value, np.timedelta64):
value = Timedelta(value, unit='s')
return super(TimeDeltaBlock, self).fillna(value, **kwargs)
def _try_coerce_args(self, values, other):
"""
Coerce values and other to int64, with null values converted to
iNaT. values is always ndarray-like, other may not be
Parameters
----------
values : ndarray-like
other : ndarray-like or scalar
Returns
-------
base-type values, values mask, base-type other, other mask
"""
values_mask = isna(values)
values = values.view('i8')
other_mask = False
if isinstance(other, bool):
raise TypeError
elif is_null_datelike_scalar(other):
other = tslib.iNaT
other_mask = True
elif isinstance(other, Timedelta):
other_mask = isna(other)
other = other.value
elif isinstance(other, timedelta):
other = Timedelta(other).value
elif isinstance(other, np.timedelta64):
other_mask = isna(other)
other = Timedelta(other).value
elif hasattr(other, 'dtype') and is_timedelta64_dtype(other):
other_mask = isna(other)
other = other.astype('i8', copy=False).view('i8')
else:
# coercion issues
# let higher levels handle
raise TypeError
return values, values_mask, other, other_mask
def _try_coerce_result(self, result):
""" reverse of try_coerce_args / try_operate """
if isinstance(result, np.ndarray):
mask = isna(result)
if result.dtype.kind in ['i', 'f', 'O']:
result = result.astype('m8[ns]')
result[mask] = tslib.iNaT
elif isinstance(result, (np.integer, np.float)):
result = self._box_func(result)
return result
def should_store(self, value):
return issubclass(value.dtype.type, np.timedelta64)
def to_native_types(self, slicer=None, na_rep=None, quoting=None,
**kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
if slicer is not None:
values = values[:, slicer]
mask = isna(values)
rvalues = np.empty(values.shape, dtype=object)
if na_rep is None:
na_rep = 'NaT'
rvalues[mask] = na_rep
imask = (~mask).ravel()
# FIXME:
# should use the formats.format.Timedelta64Formatter here
# to figure what format to pass to the Timedelta
# e.g. to not show the decimals say
rvalues.flat[imask] = np.array([Timedelta(val)._repr_base(format='all')
for val in values.ravel()[imask]],
dtype=object)
return rvalues
class BoolBlock(NumericBlock):
__slots__ = ()
is_bool = True
_can_hold_na = False
def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return issubclass(tipo.type, np.bool_)
return isinstance(element, (bool, np.bool_))
def should_store(self, value):
return issubclass(value.dtype.type, np.bool_)
def replace(self, to_replace, value, inplace=False, filter=None,
regex=False, convert=True, mgr=None):
inplace = validate_bool_kwarg(inplace, 'inplace')
to_replace_values = np.atleast_1d(to_replace)
if not np.can_cast(to_replace_values, bool):
return self
return super(BoolBlock, self).replace(to_replace, value,
inplace=inplace, filter=filter,
regex=regex, convert=convert,
mgr=mgr)
class ObjectBlock(Block):
__slots__ = ()
is_object = True
_can_hold_na = True
def __init__(self, values, placement=None, ndim=2):
if issubclass(values.dtype.type, compat.string_types):
values = np.array(values, dtype=object)
super(ObjectBlock, self).__init__(values, ndim=ndim,
placement=placement)
@property
def is_bool(self):
""" we can be a bool if we have only bool values but are of type
object
"""
return lib.is_bool_array(self.values.ravel())
# TODO: Refactor when convert_objects is removed since there will be 1 path
def convert(self, *args, **kwargs):
""" attempt to coerce any object types to better types return a copy of
the block (if copy = True) by definition we ARE an ObjectBlock!!!!!
can return multiple blocks!
"""
if args:
raise NotImplementedError
by_item = True if 'by_item' not in kwargs else kwargs['by_item']
new_inputs = ['coerce', 'datetime', 'numeric', 'timedelta']
new_style = False
for kw in new_inputs:
new_style |= kw in kwargs
if new_style:
fn = soft_convert_objects
fn_inputs = new_inputs
else:
fn = maybe_convert_objects
fn_inputs = ['convert_dates', 'convert_numeric',
'convert_timedeltas']
fn_inputs += ['copy']
fn_kwargs = {}
for key in fn_inputs:
if key in kwargs:
fn_kwargs[key] = kwargs[key]
# operate column-by-column
def f(m, v, i):
shape = v.shape
values = fn(v.ravel(), **fn_kwargs)
try:
values = values.reshape(shape)
values = _block_shape(values, ndim=self.ndim)
except (AttributeError, NotImplementedError):
pass
return values
if by_item and not self._is_single_block:
blocks = self.split_and_operate(None, f, False)
else:
values = f(None, self.values.ravel(), None)
blocks = [make_block(values, ndim=self.ndim,
placement=self.mgr_locs)]
return blocks
def set(self, locs, values, check=False):
"""
Modify Block in-place with new item value
Returns
-------
None
"""
# GH6026
if check:
try:
if (self.values[locs] == values).all():
return
except:
pass
try:
self.values[locs] = values
except (ValueError):
# broadcasting error
# see GH6171
new_shape = list(values.shape)
new_shape[0] = len(self.items)
self.values = np.empty(tuple(new_shape), dtype=self.dtype)
self.values.fill(np.nan)
self.values[locs] = values
def _maybe_downcast(self, blocks, downcast=None):
if downcast is not None:
return blocks
# split and convert the blocks
return _extend_blocks([b.convert(datetime=True, numeric=False)
for b in blocks])
def _can_hold_element(self, element):
return True
def _try_coerce_args(self, values, other):
""" provide coercion to our input arguments """
if isinstance(other, ABCDatetimeIndex):
# to store DatetimeTZBlock as object
other = other.astype(object).values
return values, False, other, False
def should_store(self, value):
return not (issubclass(value.dtype.type,
(np.integer, np.floating, np.complexfloating,
np.datetime64, np.bool_)) or
# TODO(ExtensionArray): remove is_extension_type
# when all extension arrays have been ported.
is_extension_type(value) or
is_extension_array_dtype(value))
def replace(self, to_replace, value, inplace=False, filter=None,
regex=False, convert=True, mgr=None):
to_rep_is_list = is_list_like(to_replace)
value_is_list = is_list_like(value)
both_lists = to_rep_is_list and value_is_list
either_list = to_rep_is_list or value_is_list
result_blocks = []
blocks = [self]
if not either_list and is_re(to_replace):
return self._replace_single(to_replace, value, inplace=inplace,
filter=filter, regex=True,
convert=convert, mgr=mgr)
elif not (either_list or regex):
return super(ObjectBlock, self).replace(to_replace, value,
inplace=inplace,
filter=filter, regex=regex,
convert=convert, mgr=mgr)
elif both_lists:
for to_rep, v in zip(to_replace, value):
result_blocks = []
for b in blocks:
result = b._replace_single(to_rep, v, inplace=inplace,
filter=filter, regex=regex,
convert=convert, mgr=mgr)
result_blocks = _extend_blocks(result, result_blocks)
blocks = result_blocks
return result_blocks
elif to_rep_is_list and regex:
for to_rep in to_replace:
result_blocks = []
for b in blocks:
result = b._replace_single(to_rep, value, inplace=inplace,
filter=filter, regex=regex,
convert=convert, mgr=mgr)
result_blocks = _extend_blocks(result, result_blocks)
blocks = result_blocks
return result_blocks
return self._replace_single(to_replace, value, inplace=inplace,
filter=filter, convert=convert,
regex=regex, mgr=mgr)
def _replace_single(self, to_replace, value, inplace=False, filter=None,
regex=False, convert=True, mgr=None):
inplace = validate_bool_kwarg(inplace, 'inplace')
# to_replace is regex compilable
to_rep_re = regex and is_re_compilable(to_replace)
# regex is regex compilable
regex_re = is_re_compilable(regex)
# only one will survive
if to_rep_re and regex_re:
raise AssertionError('only one of to_replace and regex can be '
'regex compilable')
# if regex was passed as something that can be a regex (rather than a
# boolean)
if regex_re:
to_replace = regex
regex = regex_re or to_rep_re
# try to get the pattern attribute (compiled re) or it's a string
try:
pattern = to_replace.pattern
except AttributeError:
pattern = to_replace
# if the pattern is not empty and to_replace is either a string or a
# regex
if regex and pattern:
rx = re.compile(to_replace)
else:
# if the thing to replace is not a string or compiled regex call
# the superclass method -> to_replace is some kind of object
return super(ObjectBlock, self).replace(to_replace, value,
inplace=inplace,
filter=filter, regex=regex,
mgr=mgr)
new_values = self.values if inplace else self.values.copy()
# deal with replacing values with objects (strings) that match but
# whose replacement is not a string (numeric, nan, object)
if isna(value) or not isinstance(value, compat.string_types):
def re_replacer(s):
try:
return value if rx.search(s) is not None else s
except TypeError:
return s
else:
# value is guaranteed to be a string here, s can be either a string
# or null if it's null it gets returned
def re_replacer(s):
try:
return rx.sub(value, s)
except TypeError:
return s
f = np.vectorize(re_replacer, otypes=[self.dtype])
if filter is None:
filt = slice(None)
else:
filt = self.mgr_locs.isin(filter).nonzero()[0]
new_values[filt] = f(new_values[filt])
# convert
block = self.make_block(new_values)
if convert:
block = block.convert(by_item=True, numeric=False)
return block
class CategoricalBlock(ExtensionBlock):
__slots__ = ()
is_categorical = True
_verify_integrity = True
_can_hold_na = True
_concatenator = staticmethod(_concat._concat_categorical)
def __init__(self, values, placement, ndim=None):
from pandas.core.arrays.categorical import _maybe_to_categorical
# coerce to categorical if we can
super(CategoricalBlock, self).__init__(_maybe_to_categorical(values),
placement=placement,
ndim=ndim)
@property
def _holder(self):
return Categorical
@property
def array_dtype(self):
""" the dtype to return if I want to construct this block as an
array
"""
return np.object_
def _try_coerce_result(self, result):
""" reverse of try_coerce_args """
# GH12564: CategoricalBlock is 1-dim only
# while returned results could be any dim
if ((not is_categorical_dtype(result)) and
isinstance(result, np.ndarray)):
result = _block_shape(result, ndim=self.ndim)
return result
def shift(self, periods, axis=0, mgr=None):
return self.make_block_same_class(values=self.values.shift(periods),
placement=self.mgr_locs)
def to_dense(self):
# Categorical.get_values returns a DatetimeIndex for datetime
# categories, so we can't simply use `np.asarray(self.values)` like
# other types.
return self.values.get_values()
def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
if slicer is not None:
# Categorical is always one dimension
values = values[slicer]
mask = isna(values)
values = np.array(values, dtype='object')
values[mask] = na_rep
# we are expected to return a 2-d ndarray
return values.reshape(1, len(values))
def concat_same_type(self, to_concat, placement=None):
"""
Concatenate list of single blocks of the same type.
Note that this CategoricalBlock._concat_same_type *may* not
return a CategoricalBlock. When the categories in `to_concat`
differ, this will return an object ndarray.
If / when we decide we don't like that behavior:
1. Change Categorical._concat_same_type to use union_categoricals
2. Delete this method.
"""
values = self._concatenator([blk.values for blk in to_concat],
axis=self.ndim - 1)
# not using self.make_block_same_class as values can be object dtype
return make_block(
values, placement=placement or slice(0, len(values), 1),
ndim=self.ndim)
class DatetimeBlock(DatetimeLikeBlockMixin, Block):
__slots__ = ()
is_datetime = True
_can_hold_na = True
def __init__(self, values, placement, ndim=None):
values = self._maybe_coerce_values(values)
super(DatetimeBlock, self).__init__(values,
placement=placement, ndim=ndim)
def _maybe_coerce_values(self, values):
"""Input validation for values passed to __init__. Ensure that
we have datetime64ns, coercing if necessary.
Parameters
----------
values : array-like
Must be convertible to datetime64
Returns
-------
values : ndarray[datetime64ns]
Overridden by DatetimeTZBlock.
"""
if values.dtype != _NS_DTYPE:
values = conversion.ensure_datetime64ns(values)
return values
def _astype(self, dtype, mgr=None, **kwargs):
"""
these automatically copy, so copy=True has no effect
raise on an except if raise == True
"""
# if we are passed a datetime64[ns, tz]
if is_datetime64tz_dtype(dtype):
dtype = DatetimeTZDtype(dtype)
values = self.values
if getattr(values, 'tz', None) is None:
values = DatetimeIndex(values).tz_localize('UTC')
values = values.tz_convert(dtype.tz)
return self.make_block(values)
# delegate
return super(DatetimeBlock, self)._astype(dtype=dtype, **kwargs)
def _can_hold_element(self, element):
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
# TODO: this still uses asarray, instead of dtype.type
element = np.array(element)
return element.dtype == _NS_DTYPE or element.dtype == np.int64
return (is_integer(element) or isinstance(element, datetime) or
isna(element))
def _try_coerce_args(self, values, other):
"""
Coerce values and other to dtype 'i8'. NaN and NaT convert to
the smallest i8, and will correctly round-trip to NaT if converted
back in _try_coerce_result. values is always ndarray-like, other
may not be
Parameters
----------
values : ndarray-like
other : ndarray-like or scalar
Returns
-------
base-type values, values mask, base-type other, other mask
"""
values_mask = isna(values)
values = values.view('i8')
other_mask = False
if isinstance(other, bool):
raise TypeError
elif is_null_datelike_scalar(other):
other = tslib.iNaT
other_mask = True
elif isinstance(other, (datetime, np.datetime64, date)):
other = self._box_func(other)
if getattr(other, 'tz') is not None:
raise TypeError("cannot coerce a Timestamp with a tz on a "
"naive Block")
other_mask = isna(other)
other = other.asm8.view('i8')
elif hasattr(other, 'dtype') and is_datetime64_dtype(other):
other_mask = isna(other)
other = other.astype('i8', copy=False).view('i8')
else:
# coercion issues
# let higher levels handle
raise TypeError
return values, values_mask, other, other_mask
def _try_coerce_result(self, result):
""" reverse of try_coerce_args """
if isinstance(result, np.ndarray):
if result.dtype.kind in ['i', 'f', 'O']:
try:
result = result.astype('M8[ns]')
except ValueError:
pass
elif isinstance(result, (np.integer, np.float, np.datetime64)):
result = self._box_func(result)
return result
@property
def _box_func(self):
return tslib.Timestamp
def to_native_types(self, slicer=None, na_rep=None, date_format=None,
quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """
values = self.values
if slicer is not None:
values = values[..., slicer]
from pandas.io.formats.format import _get_format_datetime64_from_values
format = _get_format_datetime64_from_values(values, date_format)
result = tslib.format_array_from_datetime(
values.view('i8').ravel(), tz=getattr(self.values, 'tz', None),
format=format, na_rep=na_rep).reshape(values.shape)
return np.atleast_2d(result)
def should_store(self, value):
return (issubclass(value.dtype.type, np.datetime64) and
not is_datetimetz(value))
def set(self, locs, values, check=False):
"""
Modify Block in-place with new item value
Returns
-------
None
"""
if values.dtype != _NS_DTYPE:
# Workaround for numpy 1.6 bug
values = conversion.ensure_datetime64ns(values)
self.values[locs] = values
class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock):
""" implement a datetime64 block with a tz attribute """
__slots__ = ()
_concatenator = staticmethod(_concat._concat_datetime)
is_datetimetz = True
def __init__(self, values, placement, ndim=2, dtype=None):
# XXX: This will end up calling _maybe_coerce_values twice
# when dtype is not None. It's relatively cheap (just an isinstance)
# but it'd nice to avoid.
#
# If we can remove dtype from __init__, and push that conversion
# push onto the callers, then we can remove this entire __init__
# and just use DatetimeBlock's.
if dtype is not None:
values = self._maybe_coerce_values(values, dtype=dtype)
super(DatetimeTZBlock, self).__init__(values, placement=placement,
ndim=ndim)
def _maybe_coerce_values(self, values, dtype=None):
"""Input validation for values passed to __init__. Ensure that
we have datetime64TZ, coercing if necessary.
Parametetrs
-----------
values : array-like
Must be convertible to datetime64
dtype : string or DatetimeTZDtype, optional
Does a shallow copy to this tz
Returns
-------
values : ndarray[datetime64ns]
"""
if not isinstance(values, self._holder):
values = self._holder(values)
if dtype is not None:
if isinstance(dtype, compat.string_types):
dtype = DatetimeTZDtype.construct_from_string(dtype)
values = values._shallow_copy(tz=dtype.tz)
if values.tz is None:
raise ValueError("cannot create a DatetimeTZBlock without a tz")
return values
@property
def is_view(self):
""" return a boolean if I am possibly a view """
# check the ndarray values of the DatetimeIndex values
return self.values.values.base is not None
def copy(self, deep=True, mgr=None):
""" copy constructor """
values = self.values
if deep:
values = values.copy(deep=True)
return self.make_block_same_class(values)
def external_values(self):
""" we internally represent the data as a DatetimeIndex, but for
external compat with ndarray, export as a ndarray of Timestamps
"""
return self.values.astype('datetime64[ns]').values
def get_values(self, dtype=None):
# return object dtype as Timestamps with the zones
if is_object_dtype(dtype):
return lib.map_infer(
self.values.ravel(), self._box_func).reshape(self.values.shape)
return self.values
def _slice(self, slicer):
""" return a slice of my values """
if isinstance(slicer, tuple):
col, loc = slicer
if not com.is_null_slice(col) and col != 0:
raise IndexError("{0} only contains one item".format(self))
return self.values[loc]
return self.values[slicer]
def _try_coerce_args(self, values, other):
"""
localize and return i8 for the values
Parameters
----------
values : ndarray-like
other : ndarray-like or scalar
Returns
-------
base-type values, values mask, base-type other, other mask
"""
values_mask = _block_shape(isna(values), ndim=self.ndim)
# asi8 is a view, needs copy
values = _block_shape(values.asi8, ndim=self.ndim)
other_mask = False
if isinstance(other, ABCSeries):
other = self._holder(other)
other_mask = isna(other)
if isinstance(other, bool):
raise TypeError
elif (is_null_datelike_scalar(other) or
(is_scalar(other) and isna(other))):
other = tslib.iNaT
other_mask = True
elif isinstance(other, self._holder):
if other.tz != self.values.tz:
raise ValueError("incompatible or non tz-aware value")
other = other.asi8
other_mask = isna(other)
elif isinstance(other, (np.datetime64, datetime, date)):
other = tslib.Timestamp(other)
tz = getattr(other, 'tz', None)
# test we can have an equal time zone
if tz is None or str(tz) != str(self.values.tz):
raise ValueError("incompatible or non tz-aware value")
other_mask = isna(other)
other = other.value
else:
raise TypeError
return values, values_mask, other, other_mask
def _try_coerce_result(self, result):
""" reverse of try_coerce_args """
if isinstance(result, np.ndarray):
if result.dtype.kind in ['i', 'f', 'O']:
result = result.astype('M8[ns]')
elif isinstance(result, (np.integer, np.float, np.datetime64)):
result = tslib.Timestamp(result, tz=self.values.tz)
if isinstance(result, np.ndarray):
# allow passing of > 1dim if its trivial
if result.ndim > 1:
result = result.reshape(np.prod(result.shape))
result = self.values._shallow_copy(result)
return result
@property
def _box_func(self):
return lambda x: tslib.Timestamp(x, tz=self.dtype.tz)
def shift(self, periods, axis=0, mgr=None):
""" shift the block by periods """
# think about moving this to the DatetimeIndex. This is a non-freq
# (number of periods) shift ###
N = len(self)
indexer = np.zeros(N, dtype=int)
if periods > 0:
indexer[periods:] = np.arange(N - periods)
else:
indexer[:periods] = np.arange(-periods, N)
new_values = self.values.asi8.take(indexer)
if periods > 0:
new_values[:periods] = tslib.iNaT
else:
new_values[periods:] = tslib.iNaT
new_values = self.values._shallow_copy(new_values)
return [self.make_block_same_class(new_values,
placement=self.mgr_locs)]
def diff(self, n, axis=0, mgr=None):
"""1st discrete difference
Parameters
----------
n : int, number of periods to diff
axis : int, axis to diff upon. default 0
mgr : default None
Return
------
A list with a new TimeDeltaBlock.
Note
----
The arguments here are mimicking shift so they are called correctly
by apply.
"""
if axis == 0:
# Cannot currently calculate diff across multiple blocks since this
# function is invoked via apply
raise NotImplementedError
new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8
# Reshape the new_values like how algos.diff does for timedelta data
new_values = new_values.reshape(1, len(new_values))
new_values = new_values.astype('timedelta64[ns]')
return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)]
def concat_same_type(self, to_concat, placement=None):
"""
Concatenate list of single blocks of the same type.
"""
values = self._concatenator([blk.values for blk in to_concat],
axis=self.ndim - 1)
# not using self.make_block_same_class as values can be non-tz dtype
return make_block(
values, placement=placement or slice(0, len(values), 1))
class SparseBlock(NonConsolidatableMixIn, Block):
""" implement as a list of sparse arrays of the same dtype """
__slots__ = ()
is_sparse = True
is_numeric = True
_box_to_block_values = False
_can_hold_na = True
_ftype = 'sparse'
_concatenator = staticmethod(_concat._concat_sparse)
def __init__(self, values, placement, ndim=None):
# Ensure that we have the underlying SparseArray here...
if isinstance(values, ABCSeries):
values = values.values
assert isinstance(values, SparseArray)
super(SparseBlock, self).__init__(values, placement, ndim=ndim)
@property
def _holder(self):
return SparseArray
@property
def shape(self):
return (len(self.mgr_locs), self.sp_index.length)
@property
def fill_value(self):
# return np.nan
return self.values.fill_value
@fill_value.setter
def fill_value(self, v):
self.values.fill_value = v
def to_dense(self):
return self.values.to_dense().view()
@property
def sp_values(self):
return self.values.sp_values
@sp_values.setter
def sp_values(self, v):
# reset the sparse values
self.values = SparseArray(v, sparse_index=self.sp_index,
kind=self.kind, dtype=v.dtype,
fill_value=self.values.fill_value,
copy=False)
@property
def sp_index(self):
return self.values.sp_index
@property
def kind(self):
return self.values.kind
def _astype(self, dtype, copy=False, errors='raise', values=None,
klass=None, mgr=None, **kwargs):
if values is None:
values = self.values
values = values.astype(dtype, copy=copy)
return self.make_block_same_class(values=values,
placement=self.mgr_locs)
def __len__(self):
try:
return self.sp_index.length
except:
return 0
def copy(self, deep=True, mgr=None):
return self.make_block_same_class(values=self.values,
sparse_index=self.sp_index,
kind=self.kind, copy=deep,
placement=self.mgr_locs)
def make_block_same_class(self, values, placement, sparse_index=None,
kind=None, dtype=None, fill_value=None,
copy=False, ndim=None):
""" return a new block """
if dtype is None:
dtype = values.dtype
if fill_value is None and not isinstance(values, SparseArray):
fill_value = self.values.fill_value
# if not isinstance(values, SparseArray) and values.ndim != self.ndim:
# raise ValueError("ndim mismatch")
if values.ndim == 2:
nitems = values.shape[0]
if nitems == 0:
# kludgy, but SparseBlocks cannot handle slices, where the
# output is 0-item, so let's convert it to a dense block: it
# won't take space since there's 0 items, plus it will preserve
# the dtype.
return self.make_block(np.empty(values.shape, dtype=dtype),
placement)
elif nitems > 1:
raise ValueError("Only 1-item 2d sparse blocks are supported")
else:
values = values.reshape(values.shape[1])
new_values = SparseArray(values, sparse_index=sparse_index,
kind=kind or self.kind, dtype=dtype,
fill_value=fill_value, copy=copy)
return self.make_block(new_values,
placement=placement)
def interpolate(self, method='pad', axis=0, inplace=False, limit=None,
fill_value=None, **kwargs):
values = missing.interpolate_2d(self.values.to_dense(), method, axis,
limit, fill_value)
return self.make_block_same_class(values=values,
placement=self.mgr_locs)
def fillna(self, value, limit=None, inplace=False, downcast=None,
mgr=None):
# we may need to upcast our fill to match our dtype
if limit is not None:
raise NotImplementedError("specifying a limit for 'fillna' has "
"not been implemented yet")
values = self.values if inplace else self.values.copy()
values = values.fillna(value, downcast=downcast)
return [self.make_block_same_class(values=values,
placement=self.mgr_locs)]
def shift(self, periods, axis=0, mgr=None):
""" shift the block by periods """
N = len(self.values.T)
indexer = np.zeros(N, dtype=int)
if periods > 0:
indexer[periods:] = np.arange(N - periods)
else:
indexer[:periods] = np.arange(-periods, N)
new_values = self.values.to_dense().take(indexer)
# convert integer to float if necessary. need to do a lot more than
# that, handle boolean etc also
new_values, fill_value = maybe_upcast(new_values)
if periods > 0:
new_values[:periods] = fill_value
else:
new_values[periods:] = fill_value
return [self.make_block_same_class(new_values,
placement=self.mgr_locs)]
def sparse_reindex(self, new_index):
""" sparse reindex and return a new block
current reindex only works for float64 dtype! """
values = self.values
values = values.sp_index.to_int_index().reindex(
values.sp_values.astype('float64'), values.fill_value, new_index)
return self.make_block_same_class(values, sparse_index=new_index,
placement=self.mgr_locs)
def get_block_type(values, dtype=None):
"""
Find the appropriate Block subclass to use for the given values and dtype.
Parameters
----------
values : ndarray-like
dtype : numpy or pandas dtype
Returns
-------
cls : class, subclass of Block
"""
dtype = dtype or values.dtype
vtype = dtype.type
if is_sparse(values):
cls = SparseBlock
elif issubclass(vtype, np.floating):
cls = FloatBlock
elif issubclass(vtype, np.timedelta64):
assert issubclass(vtype, np.integer)
cls = TimeDeltaBlock
elif issubclass(vtype, np.complexfloating):
cls = ComplexBlock
elif issubclass(vtype, np.datetime64):
assert not is_datetimetz(values)
cls = DatetimeBlock
elif is_datetimetz(values):
cls = DatetimeTZBlock
elif issubclass(vtype, np.integer):
cls = IntBlock
elif dtype == np.bool_:
cls = BoolBlock
elif is_categorical(values):
cls = CategoricalBlock
elif is_extension_array_dtype(values):
cls = ExtensionBlock
else:
cls = ObjectBlock
return cls
def make_block(values, placement, klass=None, ndim=None, dtype=None,
fastpath=None):
if fastpath is not None:
# GH#19265 pyarrow is passing this
warnings.warn("fastpath argument is deprecated, will be removed "
"in a future release.", DeprecationWarning)
if klass is None:
dtype = dtype or values.dtype
klass = get_block_type(values, dtype)
elif klass is DatetimeTZBlock and not is_datetimetz(values):
return klass(values, ndim=ndim,
placement=placement, dtype=dtype)
return klass(values, ndim=ndim, placement=placement)
# TODO: flexible with index=None and/or items=None
class BlockManager(PandasObject):
"""
Core internal data structure to implement DataFrame, Series, Panel, etc.
Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a
lightweight blocked set of labeled data to be manipulated by the DataFrame
public API class
Attributes
----------
shape
ndim
axes
values
items
Methods
-------
set_axis(axis, new_labels)
copy(deep=True)
get_dtype_counts
get_ftype_counts
get_dtypes
get_ftypes
apply(func, axes, block_filter_fn)
get_bool_data
get_numeric_data
get_slice(slice_like, axis)
get(label)
iget(loc)
get_scalar(label_tup)
take(indexer, axis)
reindex_axis(new_labels, axis)
reindex_indexer(new_labels, indexer, axis)
delete(label)
insert(loc, label, value)
set(label, value)
Parameters
----------
Notes
-----
This is *not* a public API class
"""
__slots__ = ['axes', 'blocks', '_ndim', '_shape', '_known_consolidated',
'_is_consolidated', '_blknos', '_blklocs']
def __init__(self, blocks, axes, do_integrity_check=True):
self.axes = [_ensure_index(ax) for ax in axes]
self.blocks = tuple(blocks)
for block in blocks:
if block.is_sparse:
if len(block.mgr_locs) != 1:
raise AssertionError("Sparse block refers to multiple "
"items")
else:
if self.ndim != block.ndim:
raise AssertionError(
'Number of Block dimensions ({block}) must equal '
'number of axes ({self})'.format(block=block.ndim,
self=self.ndim))
if do_integrity_check:
self._verify_integrity()
self._consolidate_check()
self._rebuild_blknos_and_blklocs()
def make_empty(self, axes=None):
""" return an empty BlockManager with the items axis of len 0 """
if axes is None:
axes = [_ensure_index([])] + [_ensure_index(a)
for a in self.axes[1:]]
# preserve dtype if possible
if self.ndim == 1:
blocks = np.array([], dtype=self.array_dtype)
else:
blocks = []
return self.__class__(blocks, axes)
def __nonzero__(self):
return True
# Python3 compat
__bool__ = __nonzero__
@property
def shape(self):
return tuple(len(ax) for ax in self.axes)
@property
def ndim(self):
return len(self.axes)
def set_axis(self, axis, new_labels):
new_labels = _ensure_index(new_labels)
old_len = len(self.axes[axis])
new_len = len(new_labels)
if new_len != old_len:
raise ValueError(
'Length mismatch: Expected axis has {old} elements, new '
'values have {new} elements'.format(old=old_len, new=new_len))
self.axes[axis] = new_labels
def rename_axis(self, mapper, axis, copy=True, level=None):
"""
Rename one of axes.
Parameters
----------
mapper : unary callable
axis : int
copy : boolean, default True
level : int, default None
"""
obj = self.copy(deep=copy)
obj.set_axis(axis, _transform_index(self.axes[axis], mapper, level))
return obj
def add_prefix(self, prefix):
f = partial('{prefix}{}'.format, prefix=prefix)
return self.rename_axis(f, axis=0)
def add_suffix(self, suffix):
f = partial('{}{suffix}'.format, suffix=suffix)
return self.rename_axis(f, axis=0)
@property
def _is_single_block(self):
if self.ndim == 1:
return True
if len(self.blocks) != 1:
return False
blk = self.blocks[0]
return (blk.mgr_locs.is_slice_like and
blk.mgr_locs.as_slice == slice(0, len(self), 1))
def _rebuild_blknos_and_blklocs(self):
"""
Update mgr._blknos / mgr._blklocs.
"""
new_blknos = np.empty(self.shape[0], dtype=np.int64)
new_blklocs = np.empty(self.shape[0], dtype=np.int64)
new_blknos.fill(-1)
new_blklocs.fill(-1)
for blkno, blk in enumerate(self.blocks):
rl = blk.mgr_locs
new_blknos[rl.indexer] = blkno
new_blklocs[rl.indexer] = np.arange(len(rl))
if (new_blknos == -1).any():
raise AssertionError("Gaps in blk ref_locs")
self._blknos = new_blknos
self._blklocs = new_blklocs
# make items read only for now
def _get_items(self):
return self.axes[0]
items = property(fget=_get_items)
def _get_counts(self, f):
""" return a dict of the counts of the function in BlockManager """
self._consolidate_inplace()
counts = dict()
for b in self.blocks:
v = f(b)
counts[v] = counts.get(v, 0) + b.shape[0]
return counts
def get_dtype_counts(self):
return self._get_counts(lambda b: b.dtype.name)
def get_ftype_counts(self):
return self._get_counts(lambda b: b.ftype)
def get_dtypes(self):
dtypes = np.array([blk.dtype for blk in self.blocks])
return algos.take_1d(dtypes, self._blknos, allow_fill=False)
def get_ftypes(self):
ftypes = np.array([blk.ftype for blk in self.blocks])
return algos.take_1d(ftypes, self._blknos, allow_fill=False)
def __getstate__(self):
block_values = [b.values for b in self.blocks]
block_items = [self.items[b.mgr_locs.indexer] for b in self.blocks]
axes_array = [ax for ax in self.axes]
extra_state = {
'0.14.1': {
'axes': axes_array,
'blocks': [dict(values=b.values, mgr_locs=b.mgr_locs.indexer)
for b in self.blocks]
}
}
# First three elements of the state are to maintain forward
# compatibility with 0.13.1.
return axes_array, block_values, block_items, extra_state
def __setstate__(self, state):
def unpickle_block(values, mgr_locs):
# numpy < 1.7 pickle compat
if values.dtype == 'M8[us]':
values = values.astype('M8[ns]')
return make_block(values, placement=mgr_locs)
if (isinstance(state, tuple) and len(state) >= 4 and
'0.14.1' in state[3]):
state = state[3]['0.14.1']
self.axes = [_ensure_index(ax) for ax in state['axes']]
self.blocks = tuple(unpickle_block(b['values'], b['mgr_locs'])
for b in state['blocks'])
else:
# discard anything after 3rd, support beta pickling format for a
# little while longer
ax_arrays, bvalues, bitems = state[:3]
self.axes = [_ensure_index(ax) for ax in ax_arrays]
if len(bitems) == 1 and self.axes[0].equals(bitems[0]):
# This is a workaround for pre-0.14.1 pickles that didn't
# support unpickling multi-block frames/panels with non-unique
# columns/items, because given a manager with items ["a", "b",
# "a"] there's no way of knowing which block's "a" is where.
#
# Single-block case can be supported under the assumption that
# block items corresponded to manager items 1-to-1.
all_mgr_locs = [slice(0, len(bitems[0]))]
else:
all_mgr_locs = [self.axes[0].get_indexer(blk_items)
for blk_items in bitems]
self.blocks = tuple(
unpickle_block(values, mgr_locs)
for values, mgr_locs in zip(bvalues, all_mgr_locs))
self._post_setstate()
def _post_setstate(self):
self._is_consolidated = False
self._known_consolidated = False
self._rebuild_blknos_and_blklocs()
def __len__(self):
return len(self.items)
def __unicode__(self):
output = pprint_thing(self.__class__.__name__)
for i, ax in enumerate(self.axes):
if i == 0:
output += u('\nItems: {ax}'.format(ax=ax))
else:
output += u('\nAxis {i}: {ax}'.format(i=i, ax=ax))
for block in self.blocks:
output += u('\n{block}'.format(block=pprint_thing(block)))
return output
def _verify_integrity(self):
mgr_shape = self.shape
tot_items = sum(len(x.mgr_locs) for x in self.blocks)
for block in self.blocks:
if block._verify_integrity and block.shape[1:] != mgr_shape[1:]:
construction_error(tot_items, block.shape[1:], self.axes)
if len(self.items) != tot_items:
raise AssertionError('Number of manager items must equal union of '
'block items\n# manager items: {0}, # '
'tot_items: {1}'.format(
len(self.items), tot_items))
def apply(self, f, axes=None, filter=None, do_integrity_check=False,
consolidate=True, **kwargs):
"""
iterate over the blocks, collect and create a new block manager
Parameters
----------
f : the callable or function name to operate on at the block level
axes : optional (if not supplied, use self.axes)
filter : list, if supplied, only call the block if the filter is in
the block
do_integrity_check : boolean, default False. Do the block manager
integrity check
consolidate: boolean, default True. Join together blocks having same
dtype
Returns
-------
Block Manager (new object)
"""
result_blocks = []
# filter kwarg is used in replace-* family of methods
if filter is not None:
filter_locs = set(self.items.get_indexer_for(filter))
if len(filter_locs) == len(self.items):
# All items are included, as if there were no filtering
filter = None
else:
kwargs['filter'] = filter_locs
if consolidate:
self._consolidate_inplace()
if f == 'where':
align_copy = True
if kwargs.get('align', True):
align_keys = ['other', 'cond']
else:
align_keys = ['cond']
elif f == 'putmask':
align_copy = False
if kwargs.get('align', True):
align_keys = ['new', 'mask']
else:
align_keys = ['mask']
elif f == 'eval':
align_copy = False
align_keys = ['other']
elif f == 'fillna':
# fillna internally does putmask, maybe it's better to do this
# at mgr, not block level?
align_copy = False
align_keys = ['value']
else:
align_keys = []
# TODO(EA): may interfere with ExtensionBlock.setitem for blocks
# with a .values attribute.
aligned_args = dict((k, kwargs[k])
for k in align_keys
if hasattr(kwargs[k], 'values') and
not isinstance(kwargs[k], ABCExtensionArray))
for b in self.blocks:
if filter is not None:
if not b.mgr_locs.isin(filter_locs).any():
result_blocks.append(b)
continue
if aligned_args:
b_items = self.items[b.mgr_locs.indexer]
for k, obj in aligned_args.items():
axis = getattr(obj, '_info_axis_number', 0)
kwargs[k] = obj.reindex(b_items, axis=axis,
copy=align_copy)
kwargs['mgr'] = self
applied = getattr(b, f)(**kwargs)
result_blocks = _extend_blocks(applied, result_blocks)
if len(result_blocks) == 0:
return self.make_empty(axes or self.axes)
bm = self.__class__(result_blocks, axes or self.axes,
do_integrity_check=do_integrity_check)
bm._consolidate_inplace()
return bm
def reduction(self, f, axis=0, consolidate=True, transposed=False,
**kwargs):
"""
iterate over the blocks, collect and create a new block manager.
This routine is intended for reduction type operations and
will do inference on the generated blocks.
Parameters
----------
f: the callable or function name to operate on at the block level
axis: reduction axis, default 0
consolidate: boolean, default True. Join together blocks having same
dtype
transposed: boolean, default False
we are holding transposed data
Returns
-------
Block Manager (new object)
"""
if consolidate:
self._consolidate_inplace()
axes, blocks = [], []
for b in self.blocks:
kwargs['mgr'] = self
axe, block = getattr(b, f)(axis=axis, **kwargs)
axes.append(axe)
blocks.append(block)
# note that some DatetimeTZ, Categorical are always ndim==1
ndim = {b.ndim for b in blocks}
if 2 in ndim:
new_axes = list(self.axes)
# multiple blocks that are reduced
if len(blocks) > 1:
new_axes[1] = axes[0]
# reset the placement to the original
for b, sb in zip(blocks, self.blocks):
b.mgr_locs = sb.mgr_locs
else:
new_axes[axis] = Index(np.concatenate(
[ax.values for ax in axes]))
if transposed:
new_axes = new_axes[::-1]
blocks = [b.make_block(b.values.T,
placement=np.arange(b.shape[1])
) for b in blocks]
return self.__class__(blocks, new_axes)
# 0 ndim
if 0 in ndim and 1 not in ndim:
values = np.array([b.values for b in blocks])
if len(values) == 1:
return values.item()
blocks = [make_block(values, ndim=1)]
axes = Index([ax[0] for ax in axes])
# single block
values = _concat._concat_compat([b.values for b in blocks])
# compute the orderings of our original data
if len(self.blocks) > 1:
indexer = np.empty(len(self.axes[0]), dtype=np.intp)
i = 0
for b in self.blocks:
for j in b.mgr_locs:
indexer[j] = i
i = i + 1
values = values.take(indexer)
return SingleBlockManager(
[make_block(values,
ndim=1,
placement=np.arange(len(values)))],
axes[0])
def isna(self, func, **kwargs):
return self.apply('apply', func=func, **kwargs)
def where(self, **kwargs):
return self.apply('where', **kwargs)
def eval(self, **kwargs):
return self.apply('eval', **kwargs)
def quantile(self, **kwargs):
return self.reduction('quantile', **kwargs)
def setitem(self, **kwargs):
return self.apply('setitem', **kwargs)
def putmask(self, **kwargs):
return self.apply('putmask', **kwargs)
def diff(self, **kwargs):
return self.apply('diff', **kwargs)
def interpolate(self, **kwargs):
return self.apply('interpolate', **kwargs)
def shift(self, **kwargs):
return self.apply('shift', **kwargs)
def fillna(self, **kwargs):
return self.apply('fillna', **kwargs)
def downcast(self, **kwargs):
return self.apply('downcast', **kwargs)
def astype(self, dtype, **kwargs):
return self.apply('astype', dtype=dtype, **kwargs)
def convert(self, **kwargs):
return self.apply('convert', **kwargs)
def replace(self, **kwargs):
return self.apply('replace', **kwargs)
def replace_list(self, src_list, dest_list, inplace=False, regex=False,
mgr=None):
""" do a list replace """
inplace = validate_bool_kwarg(inplace, 'inplace')
if mgr is None:
mgr = self
# figure out our mask a-priori to avoid repeated replacements
values = self.as_array()
def comp(s):
if isna(s):
return isna(values)
return _maybe_compare(values, getattr(s, 'asm8', s), operator.eq)
masks = [comp(s) for i, s in enumerate(src_list)]
result_blocks = []
src_len = len(src_list) - 1
for blk in self.blocks:
# its possible to get multiple result blocks here
# replace ALWAYS will return a list
rb = [blk if inplace else blk.copy()]
for i, (s, d) in enumerate(zip(src_list, dest_list)):
new_rb = []
for b in rb:
if b.dtype == np.object_:
convert = i == src_len
result = b.replace(s, d, inplace=inplace, regex=regex,
mgr=mgr, convert=convert)
new_rb = _extend_blocks(result, new_rb)
else:
# get our mask for this element, sized to this
# particular block
m = masks[i][b.mgr_locs.indexer]
if m.any():
b = b.coerce_to_target_dtype(d)
new_rb.extend(b.putmask(m, d, inplace=True))
else:
new_rb.append(b)
rb = new_rb
result_blocks.extend(rb)
bm = self.__class__(result_blocks, self.axes)
bm._consolidate_inplace()
return bm
def reshape_nd(self, axes, **kwargs):
""" a 2d-nd reshape operation on a BlockManager """
return self.apply('reshape_nd', axes=axes, **kwargs)
def is_consolidated(self):
"""
Return True if more than one block with the same dtype
"""
if not self._known_consolidated:
self._consolidate_check()
return self._is_consolidated
def _consolidate_check(self):
ftypes = [blk.ftype for blk in self.blocks]
self._is_consolidated = len(ftypes) == len(set(ftypes))
self._known_consolidated = True
@property
def is_mixed_type(self):
# Warning, consolidation needs to get checked upstairs
self._consolidate_inplace()
return len(self.blocks) > 1
@property
def is_numeric_mixed_type(self):
# Warning, consolidation needs to get checked upstairs
self._consolidate_inplace()
return all(block.is_numeric for block in self.blocks)
@property
def is_datelike_mixed_type(self):
# Warning, consolidation needs to get checked upstairs
self._consolidate_inplace()
return any(block.is_datelike for block in self.blocks)
@property
def any_extension_types(self):
"""Whether any of the blocks in this manager are extension blocks"""
return any(block.is_extension for block in self.blocks)
@property
def is_view(self):
""" return a boolean if we are a single block and are a view """
if len(self.blocks) == 1:
return self.blocks[0].is_view
# It is technically possible to figure out which blocks are views
# e.g. [ b.values.base is not None for b in self.blocks ]
# but then we have the case of possibly some blocks being a view
# and some blocks not. setting in theory is possible on the non-view
# blocks w/o causing a SettingWithCopy raise/warn. But this is a bit
# complicated
return False
def get_bool_data(self, copy=False):
"""
Parameters
----------
copy : boolean, default False
Whether to copy the blocks
"""
self._consolidate_inplace()
return self.combine([b for b in self.blocks if b.is_bool], copy)
def get_numeric_data(self, copy=False):
"""
Parameters
----------
copy : boolean, default False
Whether to copy the blocks
"""
self._consolidate_inplace()
return self.combine([b for b in self.blocks if b.is_numeric], copy)
def combine(self, blocks, copy=True):
""" return a new manager with the blocks """
if len(blocks) == 0:
return self.make_empty()
# FIXME: optimization potential
indexer = np.sort(np.concatenate([b.mgr_locs.as_array
for b in blocks]))
inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0])
new_blocks = []
for b in blocks:
b = b.copy(deep=copy)
b.mgr_locs = algos.take_1d(inv_indexer, b.mgr_locs.as_array,
axis=0, allow_fill=False)
new_blocks.append(b)
axes = list(self.axes)
axes[0] = self.items.take(indexer)
return self.__class__(new_blocks, axes, do_integrity_check=False)
def get_slice(self, slobj, axis=0):
if axis >= self.ndim:
raise IndexError("Requested axis not found in manager")
if axis == 0:
new_blocks = self._slice_take_blocks_ax0(slobj)
else:
slicer = [slice(None)] * (axis + 1)
slicer[axis] = slobj
slicer = tuple(slicer)
new_blocks = [blk.getitem_block(slicer) for blk in self.blocks]
new_axes = list(self.axes)
new_axes[axis] = new_axes[axis][slobj]
bm = self.__class__(new_blocks, new_axes, do_integrity_check=False)
bm._consolidate_inplace()
return bm
def __contains__(self, item):
return item in self.items
@property
def nblocks(self):
return len(self.blocks)
def copy(self, deep=True, mgr=None):
"""
Make deep or shallow copy of BlockManager
Parameters
----------
deep : boolean o rstring, default True
If False, return shallow copy (do not copy data)
If 'all', copy data and a deep copy of the index
Returns
-------
copy : BlockManager
"""
# this preserves the notion of view copying of axes
if deep:
if deep == 'all':
copy = lambda ax: ax.copy(deep=True)
else:
copy = lambda ax: ax.view()
new_axes = [copy(ax) for ax in self.axes]
else:
new_axes = list(self.axes)
return self.apply('copy', axes=new_axes, deep=deep,
do_integrity_check=False)
def as_array(self, transpose=False, items=None):
"""Convert the blockmanager data into an numpy array.
Parameters
----------
transpose : boolean, default False
If True, transpose the return array
items : list of strings or None
Names of block items that will be included in the returned
array. ``None`` means that all block items will be used
Returns
-------
arr : ndarray
"""
if len(self.blocks) == 0:
arr = np.empty(self.shape, dtype=float)
return arr.transpose() if transpose else arr
if items is not None:
mgr = self.reindex_axis(items, axis=0)
else:
mgr = self
if self._is_single_block or not self.is_mixed_type:
arr = mgr.blocks[0].get_values()
else:
arr = mgr._interleave()
return arr.transpose() if transpose else arr
def _interleave(self):
"""
Return ndarray from blocks with specified item order
Items must be contained in the blocks
"""
dtype = _interleaved_dtype(self.blocks)
result = np.empty(self.shape, dtype=dtype)
if result.shape[0] == 0:
# Workaround for numpy 1.7 bug:
#
# >>> a = np.empty((0,10))
# >>> a[slice(0,0)]
# array([], shape=(0, 10), dtype=float64)
# >>> a[[]]
# Traceback (most recent call last):
# File "<stdin>", line 1, in <module>
# IndexError: index 0 is out of bounds for axis 0 with size 0
return result
itemmask = np.zeros(self.shape[0])
for blk in self.blocks:
rl = blk.mgr_locs
result[rl.indexer] = blk.get_values(dtype)
itemmask[rl.indexer] = 1
if not itemmask.all():
raise AssertionError('Some items were not contained in blocks')
return result
def to_dict(self, copy=True):
"""
Return a dict of str(dtype) -> BlockManager
Parameters
----------
copy : boolean, default True
Returns
-------
values : a dict of dtype -> BlockManager
Notes
-----
This consolidates based on str(dtype)
"""
self._consolidate_inplace()
bd = {}
for b in self.blocks:
bd.setdefault(str(b.dtype), []).append(b)
return {dtype: self.combine(blocks, copy=copy)
for dtype, blocks in bd.items()}
def xs(self, key, axis=1, copy=True, takeable=False):
if axis < 1:
raise AssertionError(
'Can only take xs across axis >= 1, got {ax}'.format(ax=axis))
# take by position
if takeable:
loc = key
else:
loc = self.axes[axis].get_loc(key)
slicer = [slice(None, None) for _ in range(self.ndim)]
slicer[axis] = loc
slicer = tuple(slicer)
new_axes = list(self.axes)
# could be an array indexer!
if isinstance(loc, (slice, np.ndarray)):
new_axes[axis] = new_axes[axis][loc]
else:
new_axes.pop(axis)
new_blocks = []
if len(self.blocks) > 1:
# we must copy here as we are mixed type
for blk in self.blocks:
newb = make_block(values=blk.values[slicer],
klass=blk.__class__,
placement=blk.mgr_locs)
new_blocks.append(newb)
elif len(self.blocks) == 1:
block = self.blocks[0]
vals = block.values[slicer]
if copy:
vals = vals.copy()
new_blocks = [make_block(values=vals,
placement=block.mgr_locs,
klass=block.__class__)]
return self.__class__(new_blocks, new_axes)
def fast_xs(self, loc):
"""
get a cross sectional for a given location in the
items ; handle dups
return the result, is *could* be a view in the case of a
single block
"""
if len(self.blocks) == 1:
return self.blocks[0].iget((slice(None), loc))
items = self.items
# non-unique (GH4726)
if not items.is_unique:
result = self._interleave()
if self.ndim == 2:
result = result.T
return result[loc]
# unique
dtype = _interleaved_dtype(self.blocks)
n = len(items)
result = np.empty(n, dtype=dtype)
for blk in self.blocks:
# Such assignment may incorrectly coerce NaT to None
# result[blk.mgr_locs] = blk._slice((slice(None), loc))
for i, rl in enumerate(blk.mgr_locs):
result[rl] = blk._try_coerce_result(blk.iget((i, loc)))
return result
def consolidate(self):
"""
Join together blocks having same dtype
Returns
-------
y : BlockManager
"""
if self.is_consolidated():
return self
bm = self.__class__(self.blocks, self.axes)
bm._is_consolidated = False
bm._consolidate_inplace()
return bm
def _consolidate_inplace(self):
if not self.is_consolidated():
self.blocks = tuple(_consolidate(self.blocks))
self._is_consolidated = True
self._known_consolidated = True
self._rebuild_blknos_and_blklocs()
def get(self, item, fastpath=True):
"""
Return values for selected item (ndarray or BlockManager).
"""
if self.items.is_unique:
if not isna(item):
loc = self.items.get_loc(item)
else:
indexer = np.arange(len(self.items))[isna(self.items)]
# allow a single nan location indexer
if not is_scalar(indexer):
if len(indexer) == 1:
loc = indexer.item()
else:
raise ValueError("cannot label index with a null key")
return self.iget(loc, fastpath=fastpath)
else:
if isna(item):
raise TypeError("cannot label index with a null key")
indexer = self.items.get_indexer_for([item])
return self.reindex_indexer(new_axis=self.items[indexer],
indexer=indexer, axis=0,
allow_dups=True)
def iget(self, i, fastpath=True):
"""
Return the data as a SingleBlockManager if fastpath=True and possible
Otherwise return as a ndarray
"""
block = self.blocks[self._blknos[i]]
values = block.iget(self._blklocs[i])
if not fastpath or not block._box_to_block_values or values.ndim != 1:
return values
# fastpath shortcut for select a single-dim from a 2-dim BM
return SingleBlockManager(
[block.make_block_same_class(values,
placement=slice(0, len(values)),
ndim=1)],
self.axes[1])
def get_scalar(self, tup):
"""
Retrieve single item
"""
full_loc = [ax.get_loc(x) for ax, x in zip(self.axes, tup)]
blk = self.blocks[self._blknos[full_loc[0]]]
values = blk.values
# FIXME: this may return non-upcasted types?
if values.ndim == 1:
return values[full_loc[1]]
full_loc[0] = self._blklocs[full_loc[0]]
return values[tuple(full_loc)]
def delete(self, item):
"""
Delete selected item (items if non-unique) in-place.
"""
indexer = self.items.get_loc(item)
is_deleted = np.zeros(self.shape[0], dtype=np.bool_)
is_deleted[indexer] = True
ref_loc_offset = -is_deleted.cumsum()
is_blk_deleted = [False] * len(self.blocks)
if isinstance(indexer, int):
affected_start = indexer
else:
affected_start = is_deleted.nonzero()[0][0]
for blkno, _ in _fast_count_smallints(self._blknos[affected_start:]):
blk = self.blocks[blkno]
bml = blk.mgr_locs
blk_del = is_deleted[bml.indexer].nonzero()[0]
if len(blk_del) == len(bml):
is_blk_deleted[blkno] = True
continue
elif len(blk_del) != 0:
blk.delete(blk_del)
bml = blk.mgr_locs
blk.mgr_locs = bml.add(ref_loc_offset[bml.indexer])
# FIXME: use Index.delete as soon as it uses fastpath=True
self.axes[0] = self.items[~is_deleted]
self.blocks = tuple(b for blkno, b in enumerate(self.blocks)
if not is_blk_deleted[blkno])
self._shape = None
self._rebuild_blknos_and_blklocs()
def set(self, item, value, check=False):
"""
Set new item in-place. Does not consolidate. Adds new Block if not
contained in the current set of items
if check, then validate that we are not setting the same data in-place
"""
# FIXME: refactor, clearly separate broadcasting & zip-like assignment
# can prob also fix the various if tests for sparse/categorical
# TODO(EA): Remove an is_extension_ when all extension types satisfy
# the interface
value_is_extension_type = (is_extension_type(value) or
is_extension_array_dtype(value))
# categorical/spares/datetimetz
if value_is_extension_type:
def value_getitem(placement):
return value
else:
if value.ndim == self.ndim - 1:
value = _safe_reshape(value, (1,) + value.shape)
def value_getitem(placement):
return value
else:
def value_getitem(placement):
return value[placement.indexer]
if value.shape[1:] != self.shape[1:]:
raise AssertionError('Shape of new values must be compatible '
'with manager shape')
try:
loc = self.items.get_loc(item)
except KeyError:
# This item wasn't present, just insert at end
self.insert(len(self.items), item, value)
return
if isinstance(loc, int):
loc = [loc]
blknos = self._blknos[loc]
blklocs = self._blklocs[loc].copy()
unfit_mgr_locs = []
unfit_val_locs = []
removed_blknos = []
for blkno, val_locs in _get_blkno_placements(blknos, len(self.blocks),
group=True):
blk = self.blocks[blkno]
blk_locs = blklocs[val_locs.indexer]
if blk.should_store(value):
blk.set(blk_locs, value_getitem(val_locs), check=check)
else:
unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
unfit_val_locs.append(val_locs)
# If all block items are unfit, schedule the block for removal.
if len(val_locs) == len(blk.mgr_locs):
removed_blknos.append(blkno)
else:
self._blklocs[blk.mgr_locs.indexer] = -1
blk.delete(blk_locs)
self._blklocs[blk.mgr_locs.indexer] = np.arange(len(blk))
if len(removed_blknos):
# Remove blocks & update blknos accordingly
is_deleted = np.zeros(self.nblocks, dtype=np.bool_)
is_deleted[removed_blknos] = True
new_blknos = np.empty(self.nblocks, dtype=np.int64)
new_blknos.fill(-1)
new_blknos[~is_deleted] = np.arange(self.nblocks -
len(removed_blknos))
self._blknos = algos.take_1d(new_blknos, self._blknos, axis=0,
allow_fill=False)
self.blocks = tuple(blk for i, blk in enumerate(self.blocks)
if i not in set(removed_blknos))
if unfit_val_locs:
unfit_mgr_locs = np.concatenate(unfit_mgr_locs)
unfit_count = len(unfit_mgr_locs)
new_blocks = []
if value_is_extension_type:
# This code (ab-)uses the fact that sparse blocks contain only
# one item.
new_blocks.extend(
make_block(values=value.copy(), ndim=self.ndim,
placement=slice(mgr_loc, mgr_loc + 1))
for mgr_loc in unfit_mgr_locs)
self._blknos[unfit_mgr_locs] = (np.arange(unfit_count) +
len(self.blocks))
self._blklocs[unfit_mgr_locs] = 0
else:
# unfit_val_locs contains BlockPlacement objects
unfit_val_items = unfit_val_locs[0].append(unfit_val_locs[1:])
new_blocks.append(
make_block(values=value_getitem(unfit_val_items),
ndim=self.ndim, placement=unfit_mgr_locs))
self._blknos[unfit_mgr_locs] = len(self.blocks)
self._blklocs[unfit_mgr_locs] = np.arange(unfit_count)
self.blocks += tuple(new_blocks)
# Newly created block's dtype may already be present.
self._known_consolidated = False
def insert(self, loc, item, value, allow_duplicates=False):
"""
Insert item at selected position.
Parameters
----------
loc : int
item : hashable
value : array_like
allow_duplicates: bool
If False, trying to insert non-unique item will raise
"""
if not allow_duplicates and item in self.items:
# Should this be a different kind of error??
raise ValueError('cannot insert {}, already exists'.format(item))
if not isinstance(loc, int):
raise TypeError("loc must be int")
# insert to the axis; this could possibly raise a TypeError
new_axis = self.items.insert(loc, item)
block = make_block(values=value, ndim=self.ndim,
placement=slice(loc, loc + 1))
for blkno, count in _fast_count_smallints(self._blknos[loc:]):
blk = self.blocks[blkno]
if count == len(blk.mgr_locs):
blk.mgr_locs = blk.mgr_locs.add(1)
else:
new_mgr_locs = blk.mgr_locs.as_array.copy()
new_mgr_locs[new_mgr_locs >= loc] += 1
blk.mgr_locs = new_mgr_locs
if loc == self._blklocs.shape[0]:
# np.append is a lot faster (at least in numpy 1.7.1), let's use it
# if we can.
self._blklocs = np.append(self._blklocs, 0)
self._blknos = np.append(self._blknos, len(self.blocks))
else:
self._blklocs = np.insert(self._blklocs, loc, 0)
self._blknos = np.insert(self._blknos, loc, len(self.blocks))
self.axes[0] = new_axis
self.blocks += (block,)
self._shape = None
self._known_consolidated = False
if len(self.blocks) > 100:
self._consolidate_inplace()
def reindex_axis(self, new_index, axis, method=None, limit=None,
fill_value=None, copy=True):
"""
Conform block manager to new index.
"""
new_index = _ensure_index(new_index)
new_index, indexer = self.axes[axis].reindex(new_index, method=method,
limit=limit)
return self.reindex_indexer(new_index, indexer, axis=axis,
fill_value=fill_value, copy=copy)
def reindex_indexer(self, new_axis, indexer, axis, fill_value=None,
allow_dups=False, copy=True):
"""
Parameters
----------
new_axis : Index
indexer : ndarray of int64 or None
axis : int
fill_value : object
allow_dups : bool
pandas-indexer with -1's only.
"""
if indexer is None:
if new_axis is self.axes[axis] and not copy:
return self
result = self.copy(deep=copy)
result.axes = list(self.axes)
result.axes[axis] = new_axis
return result
self._consolidate_inplace()
# some axes don't allow reindexing with dups
if not allow_dups:
self.axes[axis]._can_reindex(indexer)
if axis >= self.ndim:
raise IndexError("Requested axis not found in manager")
if axis == 0:
new_blocks = self._slice_take_blocks_ax0(indexer,
fill_tuple=(fill_value,))
else:
new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=(
fill_value if fill_value is not None else blk.fill_value,))
for blk in self.blocks]
new_axes = list(self.axes)
new_axes[axis] = new_axis
return self.__class__(new_blocks, new_axes)
def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None):
"""
Slice/take blocks along axis=0.
Overloaded for SingleBlock
Returns
-------
new_blocks : list of Block
"""
allow_fill = fill_tuple is not None
sl_type, slobj, sllen = _preprocess_slice_or_indexer(
slice_or_indexer, self.shape[0], allow_fill=allow_fill)
if self._is_single_block:
blk = self.blocks[0]
if sl_type in ('slice', 'mask'):
return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))]
elif not allow_fill or self.ndim == 1:
if allow_fill and fill_tuple[0] is None:
_, fill_value = maybe_promote(blk.dtype)
fill_tuple = (fill_value, )
return [blk.take_nd(slobj, axis=0,
new_mgr_locs=slice(0, sllen),
fill_tuple=fill_tuple)]
if sl_type in ('slice', 'mask'):
blknos = self._blknos[slobj]
blklocs = self._blklocs[slobj]
else:
blknos = algos.take_1d(self._blknos, slobj, fill_value=-1,
allow_fill=allow_fill)
blklocs = algos.take_1d(self._blklocs, slobj, fill_value=-1,
allow_fill=allow_fill)
# When filling blknos, make sure blknos is updated before appending to
# blocks list, that way new blkno is exactly len(blocks).
#
# FIXME: mgr_groupby_blknos must return mgr_locs in ascending order,
# pytables serialization will break otherwise.
blocks = []
for blkno, mgr_locs in _get_blkno_placements(blknos, len(self.blocks),
group=True):
if blkno == -1:
# If we've got here, fill_tuple was not None.
fill_value = fill_tuple[0]
blocks.append(self._make_na_block(placement=mgr_locs,
fill_value=fill_value))
else:
blk = self.blocks[blkno]
# Otherwise, slicing along items axis is necessary.
if not blk._can_consolidate:
# A non-consolidatable block, it's easy, because there's
# only one item and each mgr loc is a copy of that single
# item.
for mgr_loc in mgr_locs:
newblk = blk.copy(deep=True)
newblk.mgr_locs = slice(mgr_loc, mgr_loc + 1)
blocks.append(newblk)
else:
blocks.append(blk.take_nd(blklocs[mgr_locs.indexer],
axis=0, new_mgr_locs=mgr_locs,
fill_tuple=None))
return blocks
def _make_na_block(self, placement, fill_value=None):
# TODO: infer dtypes other than float64 from fill_value
if fill_value is None:
fill_value = np.nan
block_shape = list(self.shape)
block_shape[0] = len(placement)
dtype, fill_value = infer_dtype_from_scalar(fill_value)
block_values = np.empty(block_shape, dtype=dtype)
block_values.fill(fill_value)
return make_block(block_values, placement=placement)
def take(self, indexer, axis=1, verify=True, convert=True):
"""
Take items along any axis.
"""
self._consolidate_inplace()
indexer = (np.arange(indexer.start, indexer.stop, indexer.step,
dtype='int64')
if isinstance(indexer, slice)
else np.asanyarray(indexer, dtype='int64'))
n = self.shape[axis]
if convert:
indexer = maybe_convert_indices(indexer, n)
if verify:
if ((indexer == -1) | (indexer >= n)).any():
raise Exception('Indices must be nonzero and less than '
'the axis length')
new_labels = self.axes[axis].take(indexer)
return self.reindex_indexer(new_axis=new_labels, indexer=indexer,
axis=axis, allow_dups=True)
def merge(self, other, lsuffix='', rsuffix=''):
if not self._is_indexed_like(other):
raise AssertionError('Must have same axes to merge managers')
l, r = items_overlap_with_suffix(left=self.items, lsuffix=lsuffix,
right=other.items, rsuffix=rsuffix)
new_items = _concat_indexes([l, r])
new_blocks = [blk.copy(deep=False) for blk in self.blocks]
offset = self.shape[0]
for blk in other.blocks:
blk = blk.copy(deep=False)
blk.mgr_locs = blk.mgr_locs.add(offset)
new_blocks.append(blk)
new_axes = list(self.axes)
new_axes[0] = new_items
return self.__class__(_consolidate(new_blocks), new_axes)
def _is_indexed_like(self, other):
"""
Check all axes except items
"""
if self.ndim != other.ndim:
raise AssertionError(
'Number of dimensions must agree got {ndim} and '
'{oth_ndim}'.format(ndim=self.ndim, oth_ndim=other.ndim))
for ax, oax in zip(self.axes[1:], other.axes[1:]):
if not ax.equals(oax):
return False
return True
def equals(self, other):
self_axes, other_axes = self.axes, other.axes
if len(self_axes) != len(other_axes):
return False
if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)):
return False
self._consolidate_inplace()
other._consolidate_inplace()
if len(self.blocks) != len(other.blocks):
return False
# canonicalize block order, using a tuple combining the type
# name and then mgr_locs because there might be unconsolidated
# blocks (say, Categorical) which can only be distinguished by
# the iteration order
def canonicalize(block):
return (block.dtype.name, block.mgr_locs.as_array.tolist())
self_blocks = sorted(self.blocks, key=canonicalize)
other_blocks = sorted(other.blocks, key=canonicalize)
return all(block.equals(oblock)
for block, oblock in zip(self_blocks, other_blocks))
def unstack(self, unstacker_func):
"""Return a blockmanager with all blocks unstacked.
Parameters
----------
unstacker_func : callable
A (partially-applied) ``pd.core.reshape._Unstacker`` class.
Returns
-------
unstacked : BlockManager
"""
dummy = unstacker_func(np.empty((0, 0)), value_columns=self.items)
new_columns = dummy.get_new_columns()
new_index = dummy.get_new_index()
new_blocks = []
columns_mask = []
for blk in self.blocks:
blocks, mask = blk._unstack(
partial(unstacker_func,
value_columns=self.items[blk.mgr_locs.indexer]),
new_columns)
new_blocks.extend(blocks)
columns_mask.extend(mask)
new_columns = new_columns[columns_mask]
bm = BlockManager(new_blocks, [new_columns, new_index])
return bm
class SingleBlockManager(BlockManager):
""" manage a single block with """
ndim = 1
_is_consolidated = True
_known_consolidated = True
__slots__ = ()
def __init__(self, block, axis, do_integrity_check=False, fastpath=False):
if isinstance(axis, list):
if len(axis) != 1:
raise ValueError("cannot create SingleBlockManager with more "
"than 1 axis")
axis = axis[0]
# passed from constructor, single block, single axis
if fastpath:
self.axes = [axis]
if isinstance(block, list):
# empty block
if len(block) == 0:
block = [np.array([])]
elif len(block) != 1:
raise ValueError('Cannot create SingleBlockManager with '
'more than 1 block')
block = block[0]
else:
self.axes = [_ensure_index(axis)]
# create the block here
if isinstance(block, list):
# provide consolidation to the interleaved_dtype
if len(block) > 1:
dtype = _interleaved_dtype(block)
block = [b.astype(dtype) for b in block]
block = _consolidate(block)
if len(block) != 1:
raise ValueError('Cannot create SingleBlockManager with '
'more than 1 block')
block = block[0]
if not isinstance(block, Block):
block = make_block(block, placement=slice(0, len(axis)), ndim=1)
self.blocks = [block]
def _post_setstate(self):
pass
@property
def _block(self):
return self.blocks[0]
@property
def _values(self):
return self._block.values
@property
def _blknos(self):
""" compat with BlockManager """
return None
@property
def _blklocs(self):
""" compat with BlockManager """
return None
def get_slice(self, slobj, axis=0):
if axis >= self.ndim:
raise IndexError("Requested axis not found in manager")
return self.__class__(self._block._slice(slobj),
self.index[slobj], fastpath=True)
@property
def index(self):
return self.axes[0]
def convert(self, **kwargs):
""" convert the whole block as one """
kwargs['by_item'] = False
return self.apply('convert', **kwargs)
@property
def dtype(self):
return self._block.dtype
@property
def array_dtype(self):
return self._block.array_dtype
@property
def ftype(self):
return self._block.ftype
def get_dtype_counts(self):
return {self.dtype.name: 1}
def get_ftype_counts(self):
return {self.ftype: 1}
def get_dtypes(self):
return np.array([self._block.dtype])
def get_ftypes(self):
return np.array([self._block.ftype])
def external_values(self):
return self._block.external_values()
def internal_values(self):
return self._block.internal_values()
def formatting_values(self):
"""Return the internal values used by the DataFrame/SeriesFormatter"""
return self._block.formatting_values()
def get_values(self):
""" return a dense type view """
return np.array(self._block.to_dense(), copy=False)
@property
def asobject(self):
"""
return a object dtype array. datetime/timedelta like values are boxed
to Timestamp/Timedelta instances.
"""
return self._block.get_values(dtype=object)
@property
def _can_hold_na(self):
return self._block._can_hold_na
def is_consolidated(self):
return True
def _consolidate_check(self):
pass
def _consolidate_inplace(self):
pass
def delete(self, item):
"""
Delete single item from SingleBlockManager.
Ensures that self.blocks doesn't become empty.
"""
loc = self.items.get_loc(item)
self._block.delete(loc)
self.axes[0] = self.axes[0].delete(loc)
def fast_xs(self, loc):
"""
fast path for getting a cross-section
return a view of the data
"""
return self._block.values[loc]
def concat(self, to_concat, new_axis):
"""
Concatenate a list of SingleBlockManagers into a single
SingleBlockManager.
Used for pd.concat of Series objects with axis=0.
Parameters
----------
to_concat : list of SingleBlockManagers
new_axis : Index of the result
Returns
-------
SingleBlockManager
"""
non_empties = [x for x in to_concat if len(x) > 0]
# check if all series are of the same block type:
if len(non_empties) > 0:
blocks = [obj.blocks[0] for obj in non_empties]
if all(type(b) is type(blocks[0]) for b in blocks[1:]): # noqa
new_block = blocks[0].concat_same_type(blocks)
else:
values = [x.values for x in blocks]
values = _concat._concat_compat(values)
new_block = make_block(
values, placement=slice(0, len(values), 1))
else:
values = [x._block.values for x in to_concat]
values = _concat._concat_compat(values)
new_block = make_block(
values, placement=slice(0, len(values), 1))
mgr = SingleBlockManager(new_block, new_axis)
return mgr
def construction_error(tot_items, block_shape, axes, e=None):
""" raise a helpful message about our construction """
passed = tuple(map(int, [tot_items] + list(block_shape)))
implied = tuple(map(int, [len(ax) for ax in axes]))
if passed == implied and e is not None:
raise e
if block_shape[0] == 0:
raise ValueError("Empty data passed with indices specified.")
raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
passed, implied))
def create_block_manager_from_blocks(blocks, axes):
try:
if len(blocks) == 1 and not isinstance(blocks[0], Block):
# if blocks[0] is of length 0, return empty blocks
if not len(blocks[0]):
blocks = []
else:
# It's OK if a single block is passed as values, its placement
# is basically "all items", but if there're many, don't bother
# converting, it's an error anyway.
blocks = [make_block(values=blocks[0],
placement=slice(0, len(axes[0])))]
mgr = BlockManager(blocks, axes)
mgr._consolidate_inplace()
return mgr
except (ValueError) as e:
blocks = [getattr(b, 'values', b) for b in blocks]
tot_items = sum(b.shape[0] for b in blocks)
construction_error(tot_items, blocks[0].shape[1:], axes, e)
def create_block_manager_from_arrays(arrays, names, axes):
try:
blocks = form_blocks(arrays, names, axes)
mgr = BlockManager(blocks, axes)
mgr._consolidate_inplace()
return mgr
except ValueError as e:
construction_error(len(arrays), arrays[0].shape, axes, e)
def form_blocks(arrays, names, axes):
# put "leftover" items in float bucket, where else?
# generalize?
items_dict = defaultdict(list)
extra_locs = []
names_idx = _ensure_index(names)
if names_idx.equals(axes[0]):
names_indexer = np.arange(len(names_idx))
else:
assert names_idx.intersection(axes[0]).is_unique
names_indexer = names_idx.get_indexer_for(axes[0])
for i, name_idx in enumerate(names_indexer):
if name_idx == -1:
extra_locs.append(i)
continue
k = names[name_idx]
v = arrays[name_idx]
block_type = get_block_type(v)
items_dict[block_type.__name__].append((i, k, v))
blocks = []
if len(items_dict['FloatBlock']):
float_blocks = _multi_blockify(items_dict['FloatBlock'])
blocks.extend(float_blocks)
if len(items_dict['ComplexBlock']):
complex_blocks = _multi_blockify(items_dict['ComplexBlock'])
blocks.extend(complex_blocks)
if len(items_dict['TimeDeltaBlock']):
timedelta_blocks = _multi_blockify(items_dict['TimeDeltaBlock'])
blocks.extend(timedelta_blocks)
if len(items_dict['IntBlock']):
int_blocks = _multi_blockify(items_dict['IntBlock'])
blocks.extend(int_blocks)
if len(items_dict['DatetimeBlock']):
datetime_blocks = _simple_blockify(items_dict['DatetimeBlock'],
_NS_DTYPE)
blocks.extend(datetime_blocks)
if len(items_dict['DatetimeTZBlock']):
dttz_blocks = [make_block(array,
klass=DatetimeTZBlock,
placement=[i])
for i, _, array in items_dict['DatetimeTZBlock']]
blocks.extend(dttz_blocks)
if len(items_dict['BoolBlock']):
bool_blocks = _simple_blockify(items_dict['BoolBlock'], np.bool_)
blocks.extend(bool_blocks)
if len(items_dict['ObjectBlock']) > 0:
object_blocks = _simple_blockify(items_dict['ObjectBlock'], np.object_)
blocks.extend(object_blocks)
if len(items_dict['SparseBlock']) > 0:
sparse_blocks = _sparse_blockify(items_dict['SparseBlock'])
blocks.extend(sparse_blocks)
if len(items_dict['CategoricalBlock']) > 0:
cat_blocks = [make_block(array, klass=CategoricalBlock, placement=[i])
for i, _, array in items_dict['CategoricalBlock']]
blocks.extend(cat_blocks)
if len(items_dict['ExtensionBlock']):
external_blocks = [
make_block(array, klass=ExtensionBlock, placement=[i])
for i, _, array in items_dict['ExtensionBlock']
]
blocks.extend(external_blocks)
if len(extra_locs):
shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:])
# empty items -> dtype object
block_values = np.empty(shape, dtype=object)
block_values.fill(np.nan)
na_block = make_block(block_values, placement=extra_locs)
blocks.append(na_block)
return blocks
def _simple_blockify(tuples, dtype):
""" return a single array of a block that has a single dtype; if dtype is
not None, coerce to this dtype
"""
values, placement = _stack_arrays(tuples, dtype)
# CHECK DTYPE?
if dtype is not None and values.dtype != dtype: # pragma: no cover
values = values.astype(dtype)
block = make_block(values, placement=placement)
return [block]
def _multi_blockify(tuples, dtype=None):
""" return an array of blocks that potentially have different dtypes """
# group by dtype
grouper = itertools.groupby(tuples, lambda x: x[2].dtype)
new_blocks = []
for dtype, tup_block in grouper:
values, placement = _stack_arrays(list(tup_block), dtype)
block = make_block(values, placement=placement)
new_blocks.append(block)
return new_blocks
def _sparse_blockify(tuples, dtype=None):
""" return an array of blocks that potentially have different dtypes (and
are sparse)
"""
new_blocks = []
for i, names, array in tuples:
array = _maybe_to_sparse(array)
block = make_block(array, klass=SparseBlock, placement=[i])
new_blocks.append(block)
return new_blocks
def _stack_arrays(tuples, dtype):
# fml
def _asarray_compat(x):
if isinstance(x, ABCSeries):
return x._values
else:
return np.asarray(x)
def _shape_compat(x):
if isinstance(x, ABCSeries):
return len(x),
else:
return x.shape
placement, names, arrays = zip(*tuples)
first = arrays[0]
shape = (len(arrays),) + _shape_compat(first)
stacked = np.empty(shape, dtype=dtype)
for i, arr in enumerate(arrays):
stacked[i] = _asarray_compat(arr)
return stacked, placement
def _interleaved_dtype(blocks):
if not len(blocks):
return None
dtype = find_common_type([b.dtype for b in blocks])
# only numpy compat
if isinstance(dtype, (PandasExtensionDtype, ExtensionDtype)):
dtype = np.object
return dtype
def _consolidate(blocks):
"""
Merge blocks having same dtype, exclude non-consolidating blocks
"""
# sort by _can_consolidate, dtype
gkey = lambda x: x._consolidate_key
grouper = itertools.groupby(sorted(blocks, key=gkey), gkey)
new_blocks = []
for (_can_consolidate, dtype), group_blocks in grouper:
merged_blocks = _merge_blocks(list(group_blocks), dtype=dtype,
_can_consolidate=_can_consolidate)
new_blocks = _extend_blocks(merged_blocks, new_blocks)
return new_blocks
def _merge_blocks(blocks, dtype=None, _can_consolidate=True):
if len(blocks) == 1:
return blocks[0]
if _can_consolidate:
if dtype is None:
if len({b.dtype for b in blocks}) != 1:
raise AssertionError("_merge_blocks are invalid!")
dtype = blocks[0].dtype
# FIXME: optimization potential in case all mgrs contain slices and
# combination of those slices is a slice, too.
new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks])
new_values = _vstack([b.values for b in blocks], dtype)
argsort = np.argsort(new_mgr_locs)
new_values = new_values[argsort]
new_mgr_locs = new_mgr_locs[argsort]
return make_block(new_values, placement=new_mgr_locs)
# no merge
return blocks
def _extend_blocks(result, blocks=None):
""" return a new extended blocks, givin the result """
if blocks is None:
blocks = []
if isinstance(result, list):
for r in result:
if isinstance(r, list):
blocks.extend(r)
else:
blocks.append(r)
elif isinstance(result, BlockManager):
blocks.extend(result.blocks)
else:
blocks.append(result)
return blocks
def _block_shape(values, ndim=1, shape=None):
""" guarantee the shape of the values to be at least 1 d """
if values.ndim < ndim:
if shape is None:
shape = values.shape
values = values.reshape(tuple((1, ) + shape))
return values
def _vstack(to_stack, dtype):
# work around NumPy 1.6 bug
if dtype == _NS_DTYPE or dtype == _TD_DTYPE:
new_values = np.vstack([x.view('i8') for x in to_stack])
return new_values.view(dtype)
else:
return np.vstack(to_stack)
def _maybe_compare(a, b, op):
is_a_array = isinstance(a, np.ndarray)
is_b_array = isinstance(b, np.ndarray)
# numpy deprecation warning to have i8 vs integer comparisons
if is_datetimelike_v_numeric(a, b):
result = False
# numpy deprecation warning if comparing numeric vs string-like
elif is_numeric_v_string_like(a, b):
result = False
else:
result = op(a, b)
if is_scalar(result) and (is_a_array or is_b_array):
type_names = [type(a).__name__, type(b).__name__]
if is_a_array:
type_names[0] = 'ndarray(dtype={dtype})'.format(dtype=a.dtype)
if is_b_array:
type_names[1] = 'ndarray(dtype={dtype})'.format(dtype=b.dtype)
raise TypeError(
"Cannot compare types {a!r} and {b!r}".format(a=type_names[0],
b=type_names[1]))
return result
def _concat_indexes(indexes):
return indexes[0].append(indexes[1:])
def _block2d_to_blocknd(values, placement, shape, labels, ref_items):
""" pivot to the labels shape """
panel_shape = (len(placement),) + shape
# TODO: lexsort depth needs to be 2!!
# Create observation selection vector using major and minor
# labels, for converting to panel format.
selector = _factor_indexer(shape[1:], labels)
mask = np.zeros(np.prod(shape), dtype=bool)
mask.put(selector, True)
if mask.all():
pvalues = np.empty(panel_shape, dtype=values.dtype)
else:
dtype, fill_value = maybe_promote(values.dtype)
pvalues = np.empty(panel_shape, dtype=dtype)
pvalues.fill(fill_value)
for i in range(len(placement)):
pvalues[i].flat[mask] = values[:, i]
return make_block(pvalues, placement=placement)
def _factor_indexer(shape, labels):
"""
given a tuple of shape and a list of Categorical labels, return the
expanded label indexer
"""
mult = np.array(shape)[::-1].cumprod()[::-1]
return _ensure_platform_int(
np.sum(np.array(labels).T * np.append(mult, [1]), axis=1).T)
def _get_blkno_placements(blknos, blk_count, group=True):
"""
Parameters
----------
blknos : array of int64
blk_count : int
group : bool
Returns
-------
iterator
yield (BlockPlacement, blkno)
"""
blknos = _ensure_int64(blknos)
# FIXME: blk_count is unused, but it may avoid the use of dicts in cython
for blkno, indexer in libinternals.get_blkno_indexers(blknos, group):
yield blkno, BlockPlacement(indexer)
def items_overlap_with_suffix(left, lsuffix, right, rsuffix):
"""
If two indices overlap, add suffixes to overlapping entries.
If corresponding suffix is empty, the entry is simply converted to string.
"""
to_rename = left.intersection(right)
if len(to_rename) == 0:
return left, right
else:
if not lsuffix and not rsuffix:
raise ValueError('columns overlap but no suffix specified: '
'{rename}'.format(rename=to_rename))
def lrenamer(x):
if x in to_rename:
return '{x}{lsuffix}'.format(x=x, lsuffix=lsuffix)
return x
def rrenamer(x):
if x in to_rename:
return '{x}{rsuffix}'.format(x=x, rsuffix=rsuffix)
return x
return (_transform_index(left, lrenamer),
_transform_index(right, rrenamer))
def _safe_reshape(arr, new_shape):
"""
If possible, reshape `arr` to have shape `new_shape`,
with a couple of exceptions (see gh-13012):
1) If `arr` is a ExtensionArray or Index, `arr` will be
returned as is.
2) If `arr` is a Series, the `_values` attribute will
be reshaped and returned.
Parameters
----------
arr : array-like, object to be reshaped
new_shape : int or tuple of ints, the new shape
"""
if isinstance(arr, ABCSeries):
arr = arr._values
if not isinstance(arr, ABCExtensionArray):
arr = arr.reshape(new_shape)
return arr
def _transform_index(index, func, level=None):
"""
Apply function to all values found in index.
This includes transforming multiindex entries separately.
Only apply function to one level of the MultiIndex if level is specified.
"""
if isinstance(index, MultiIndex):
if level is not None:
items = [tuple(func(y) if i == level else y
for i, y in enumerate(x)) for x in index]
else:
items = [tuple(func(y) for y in x) for x in index]
return MultiIndex.from_tuples(items, names=index.names)
else:
items = [func(x) for x in index]
return Index(items, name=index.name, tupleize_cols=False)
def _putmask_smart(v, m, n):
"""
Return a new ndarray, try to preserve dtype if possible.
Parameters
----------
v : `values`, updated in-place (array like)
m : `mask`, applies to both sides (array like)
n : `new values` either scalar or an array like aligned with `values`
Returns
-------
values : ndarray with updated values
this *may* be a copy of the original
See Also
--------
ndarray.putmask
"""
# we cannot use np.asarray() here as we cannot have conversions
# that numpy does when numeric are mixed with strings
# n should be the length of the mask or a scalar here
if not is_list_like(n):
n = np.repeat(n, len(m))
elif isinstance(n, np.ndarray) and n.ndim == 0: # numpy scalar
n = np.repeat(np.array(n, ndmin=1), len(m))
# see if we are only masking values that if putted
# will work in the current dtype
try:
nn = n[m]
# make sure that we have a nullable type
# if we have nulls
if not _isna_compat(v, nn[0]):
raise ValueError
# we ignore ComplexWarning here
with catch_warnings(record=True):
nn_at = nn.astype(v.dtype)
# avoid invalid dtype comparisons
# between numbers & strings
# only compare integers/floats
# don't compare integers to datetimelikes
if (not is_numeric_v_string_like(nn, nn_at) and
(is_float_dtype(nn.dtype) or
is_integer_dtype(nn.dtype) and
is_float_dtype(nn_at.dtype) or
is_integer_dtype(nn_at.dtype))):
comp = (nn == nn_at)
if is_list_like(comp) and comp.all():
nv = v.copy()
nv[m] = nn_at
return nv
except (ValueError, IndexError, TypeError):
pass
n = np.asarray(n)
def _putmask_preserve(nv, n):
try:
nv[m] = n[m]
except (IndexError, ValueError):
nv[m] = n
return nv
# preserves dtype if possible
if v.dtype.kind == n.dtype.kind:
return _putmask_preserve(v, n)
# change the dtype if needed
dtype, _ = maybe_promote(n.dtype)
if is_extension_type(v.dtype) and is_object_dtype(dtype):
v = v.get_values(dtype)
else:
v = v.astype(dtype)
return _putmask_preserve(v, n)
def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy):
"""
Concatenate block managers into one.
Parameters
----------
mgrs_indexers : list of (BlockManager, {axis: indexer,...}) tuples
axes : list of Index
concat_axis : int
copy : bool
"""
concat_plan = combine_concat_plans(
[get_mgr_concatenation_plan(mgr, indexers)
for mgr, indexers in mgrs_indexers], concat_axis)
blocks = []
for placement, join_units in concat_plan:
if len(join_units) == 1 and not join_units[0].indexers:
b = join_units[0].block
values = b.values
if copy:
values = values.copy()
elif not copy:
values = values.view()
b = b.make_block_same_class(values, placement=placement)
elif is_uniform_join_units(join_units):
b = join_units[0].block.concat_same_type(
[ju.block for ju in join_units], placement=placement)
else:
b = make_block(
concatenate_join_units(join_units, concat_axis, copy=copy),
placement=placement)
blocks.append(b)
return BlockManager(blocks, axes)
def is_uniform_join_units(join_units):
"""
Check if the join units consist of blocks of uniform type that can
be concatenated using Block.concat_same_type instead of the generic
concatenate_join_units (which uses `_concat._concat_compat`).
"""
return (
# all blocks need to have the same type
all(type(ju.block) is type(join_units[0].block) for ju in join_units) and # noqa
# no blocks that would get missing values (can lead to type upcasts)
# unless we're an extension dtype.
all(not ju.is_na or ju.block.is_extension for ju in join_units) and
# no blocks with indexers (as then the dimensions do not fit)
all(not ju.indexers for ju in join_units) and
# disregard Panels
all(ju.block.ndim <= 2 for ju in join_units) and
# only use this path when there is something to concatenate
len(join_units) > 1)
def is_uniform_reindex(join_units):
return (
# TODO: should this be ju.block._can_hold_na?
all(ju.block and ju.block.is_extension for ju in join_units) and
len(set(ju.block.dtype.name for ju in join_units)) == 1
)
def get_empty_dtype_and_na(join_units):
"""
Return dtype and N/A values to use when concatenating specified units.
Returned N/A value may be None which means there was no casting involved.
Returns
-------
dtype
na
"""
if len(join_units) == 1:
blk = join_units[0].block
if blk is None:
return np.float64, np.nan
if is_uniform_reindex(join_units):
# XXX: integrate property
empty_dtype = join_units[0].block.dtype
upcasted_na = join_units[0].block.fill_value
return empty_dtype, upcasted_na
has_none_blocks = False
dtypes = [None] * len(join_units)
for i, unit in enumerate(join_units):
if unit.block is None:
has_none_blocks = True
else:
dtypes[i] = unit.dtype
upcast_classes = defaultdict(list)
null_upcast_classes = defaultdict(list)
for dtype, unit in zip(dtypes, join_units):
if dtype is None:
continue
if is_categorical_dtype(dtype):
upcast_cls = 'category'
elif is_datetimetz(dtype):
upcast_cls = 'datetimetz'
elif issubclass(dtype.type, np.bool_):
upcast_cls = 'bool'
elif issubclass(dtype.type, np.object_):
upcast_cls = 'object'
elif is_datetime64_dtype(dtype):
upcast_cls = 'datetime'
elif is_timedelta64_dtype(dtype):
upcast_cls = 'timedelta'
elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
upcast_cls = dtype.name
else:
upcast_cls = 'float'
# Null blocks should not influence upcast class selection, unless there
# are only null blocks, when same upcasting rules must be applied to
# null upcast classes.
if unit.is_na:
null_upcast_classes[upcast_cls].append(dtype)
else:
upcast_classes[upcast_cls].append(dtype)
if not upcast_classes:
upcast_classes = null_upcast_classes
# create the result
if 'object' in upcast_classes:
return np.dtype(np.object_), np.nan
elif 'bool' in upcast_classes:
if has_none_blocks:
return np.dtype(np.object_), np.nan
else:
return np.dtype(np.bool_), None
elif 'category' in upcast_classes:
return np.dtype(np.object_), np.nan
elif 'datetimetz' in upcast_classes:
dtype = upcast_classes['datetimetz']
return dtype[0], tslib.iNaT
elif 'datetime' in upcast_classes:
return np.dtype('M8[ns]'), tslib.iNaT
elif 'timedelta' in upcast_classes:
return np.dtype('m8[ns]'), tslib.iNaT
else: # pragma
g = np.find_common_type(upcast_classes, [])
if is_float_dtype(g):
return g, g.type(np.nan)
elif is_numeric_dtype(g):
if has_none_blocks:
return np.float64, np.nan
else:
return g, None
msg = "invalid dtype determination in get_concat_dtype"
raise AssertionError(msg)
def concatenate_join_units(join_units, concat_axis, copy):
"""
Concatenate values from several join units along selected axis.
"""
if concat_axis == 0 and len(join_units) > 1:
# Concatenating join units along ax0 is handled in _merge_blocks.
raise AssertionError("Concatenating join units along axis0")
empty_dtype, upcasted_na = get_empty_dtype_and_na(join_units)
to_concat = [ju.get_reindexed_values(empty_dtype=empty_dtype,
upcasted_na=upcasted_na)
for ju in join_units]
if len(to_concat) == 1:
# Only one block, nothing to concatenate.
concat_values = to_concat[0]
if copy:
if isinstance(concat_values, np.ndarray):
# non-reindexed (=not yet copied) arrays are made into a view
# in JoinUnit.get_reindexed_values
if concat_values.base is not None:
concat_values = concat_values.copy()
else:
concat_values = concat_values.copy()
else:
concat_values = _concat._concat_compat(to_concat, axis=concat_axis)
return concat_values
def get_mgr_concatenation_plan(mgr, indexers):
"""
Construct concatenation plan for given block manager and indexers.
Parameters
----------
mgr : BlockManager
indexers : dict of {axis: indexer}
Returns
-------
plan : list of (BlockPlacement, JoinUnit) tuples
"""
# Calculate post-reindex shape , save for item axis which will be separate
# for each block anyway.
mgr_shape = list(mgr.shape)
for ax, indexer in indexers.items():
mgr_shape[ax] = len(indexer)
mgr_shape = tuple(mgr_shape)
if 0 in indexers:
ax0_indexer = indexers.pop(0)
blknos = algos.take_1d(mgr._blknos, ax0_indexer, fill_value=-1)
blklocs = algos.take_1d(mgr._blklocs, ax0_indexer, fill_value=-1)
else:
if mgr._is_single_block:
blk = mgr.blocks[0]
return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]
ax0_indexer = None
blknos = mgr._blknos
blklocs = mgr._blklocs
plan = []
for blkno, placements in _get_blkno_placements(blknos, len(mgr.blocks),
group=False):
assert placements.is_slice_like
join_unit_indexers = indexers.copy()
shape = list(mgr_shape)
shape[0] = len(placements)
shape = tuple(shape)
if blkno == -1:
unit = JoinUnit(None, shape)
else:
blk = mgr.blocks[blkno]
ax0_blk_indexer = blklocs[placements.indexer]
unit_no_ax0_reindexing = (len(placements) == len(blk.mgr_locs) and
# Fastpath detection of join unit not
# needing to reindex its block: no ax0
# reindexing took place and block
# placement was sequential before.
((ax0_indexer is None and
blk.mgr_locs.is_slice_like and
blk.mgr_locs.as_slice.step == 1) or
# Slow-ish detection: all indexer locs
# are sequential (and length match is
# checked above).
(np.diff(ax0_blk_indexer) == 1).all()))
# Omit indexer if no item reindexing is required.
if unit_no_ax0_reindexing:
join_unit_indexers.pop(0, None)
else:
join_unit_indexers[0] = ax0_blk_indexer
unit = JoinUnit(blk, shape, join_unit_indexers)
plan.append((placements, unit))
return plan
def combine_concat_plans(plans, concat_axis):
"""
Combine multiple concatenation plans into one.
existing_plan is updated in-place.
"""
if len(plans) == 1:
for p in plans[0]:
yield p[0], [p[1]]
elif concat_axis == 0:
offset = 0
for plan in plans:
last_plc = None
for plc, unit in plan:
yield plc.add(offset), [unit]
last_plc = plc
if last_plc is not None:
offset += last_plc.as_slice.stop
else:
num_ended = [0]
def _next_or_none(seq):
retval = next(seq, None)
if retval is None:
num_ended[0] += 1
return retval
plans = list(map(iter, plans))
next_items = list(map(_next_or_none, plans))
while num_ended[0] != len(next_items):
if num_ended[0] > 0:
raise ValueError("Plan shapes are not aligned")
placements, units = zip(*next_items)
lengths = list(map(len, placements))
min_len, max_len = min(lengths), max(lengths)
if min_len == max_len:
yield placements[0], units
next_items[:] = map(_next_or_none, plans)
else:
yielded_placement = None
yielded_units = [None] * len(next_items)
for i, (plc, unit) in enumerate(next_items):
yielded_units[i] = unit
if len(plc) > min_len:
# trim_join_unit updates unit in place, so only
# placement needs to be sliced to skip min_len.
next_items[i] = (plc[min_len:],
trim_join_unit(unit, min_len))
else:
yielded_placement = plc
next_items[i] = _next_or_none(plans[i])
yield yielded_placement, yielded_units
def trim_join_unit(join_unit, length):
"""
Reduce join_unit's shape along item axis to length.
Extra items that didn't fit are returned as a separate block.
"""
if 0 not in join_unit.indexers:
extra_indexers = join_unit.indexers
if join_unit.block is None:
extra_block = None
else:
extra_block = join_unit.block.getitem_block(slice(length, None))
join_unit.block = join_unit.block.getitem_block(slice(length))
else:
extra_block = join_unit.block
extra_indexers = copy.copy(join_unit.indexers)
extra_indexers[0] = extra_indexers[0][length:]
join_unit.indexers[0] = join_unit.indexers[0][:length]
extra_shape = (join_unit.shape[0] - length,) + join_unit.shape[1:]
join_unit.shape = (length,) + join_unit.shape[1:]
return JoinUnit(block=extra_block, indexers=extra_indexers,
shape=extra_shape)
class JoinUnit(object):
def __init__(self, block, shape, indexers=None):
# Passing shape explicitly is required for cases when block is None.
if indexers is None:
indexers = {}
self.block = block
self.indexers = indexers
self.shape = shape
def __repr__(self):
return '{name}({block!r}, {indexers})'.format(
name=self.__class__.__name__, block=self.block,
indexers=self.indexers)
@cache_readonly
def needs_filling(self):
for indexer in self.indexers.values():
# FIXME: cache results of indexer == -1 checks.
if (indexer == -1).any():
return True
return False
@cache_readonly
def dtype(self):
if self.block is None:
raise AssertionError("Block is None, no dtype")
if not self.needs_filling:
return self.block.dtype
else:
return _get_dtype(maybe_promote(self.block.dtype,
self.block.fill_value)[0])
@cache_readonly
def is_na(self):
if self.block is None:
return True
if not self.block._can_hold_na:
return False
# Usually it's enough to check but a small fraction of values to see if
# a block is NOT null, chunks should help in such cases. 1000 value
# was chosen rather arbitrarily.
values = self.block.values
if self.block.is_categorical:
values_flat = values.categories
elif self.block.is_sparse:
# fill_value is not NaN and have holes
if not values._null_fill_value and values.sp_index.ngaps > 0:
return False
values_flat = values.ravel(order='K')
elif isinstance(self.block, ExtensionBlock):
values_flat = values
else:
values_flat = values.ravel(order='K')
total_len = values_flat.shape[0]
chunk_len = max(total_len // 40, 1000)
for i in range(0, total_len, chunk_len):
if not isna(values_flat[i:i + chunk_len]).all():
return False
return True
def get_reindexed_values(self, empty_dtype, upcasted_na):
if upcasted_na is None:
# No upcasting is necessary
fill_value = self.block.fill_value
values = self.block.get_values()
else:
fill_value = upcasted_na
if self.is_na:
if getattr(self.block, 'is_object', False):
# we want to avoid filling with np.nan if we are
# using None; we already know that we are all
# nulls
values = self.block.values.ravel(order='K')
if len(values) and values[0] is None:
fill_value = None
if getattr(self.block, 'is_datetimetz', False) or \
is_datetimetz(empty_dtype):
pass
elif getattr(self.block, 'is_categorical', False):
pass
elif getattr(self.block, 'is_sparse', False):
pass
else:
missing_arr = np.empty(self.shape, dtype=empty_dtype)
missing_arr.fill(fill_value)
return missing_arr
if not self.indexers:
if not self.block._can_consolidate:
# preserve these for validation in _concat_compat
return self.block.values
if self.block.is_bool and not self.block.is_categorical:
# External code requested filling/upcasting, bool values must
# be upcasted to object to avoid being upcasted to numeric.
values = self.block.astype(np.object_).values
elif self.block.is_extension:
values = self.block.values
else:
# No dtype upcasting is done here, it will be performed during
# concatenation itself.
values = self.block.get_values()
if not self.indexers:
# If there's no indexing to be done, we want to signal outside
# code that this array must be copied explicitly. This is done
# by returning a view and checking `retval.base`.
values = values.view()
else:
for ax, indexer in self.indexers.items():
values = algos.take_nd(values, indexer, axis=ax,
fill_value=fill_value)
return values
def _fast_count_smallints(arr):
"""Faster version of set(arr) for sequences of small numbers."""
if len(arr) == 0:
# Handle empty arr case separately: numpy 1.6 chokes on that.
return np.empty((0, 2), dtype=arr.dtype)
else:
counts = np.bincount(arr.astype(np.int_))
nz = counts.nonzero()[0]
return np.c_[nz, counts[nz]]
def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill):
if isinstance(slice_or_indexer, slice):
return ('slice', slice_or_indexer,
libinternals.slice_len(slice_or_indexer, length))
elif (isinstance(slice_or_indexer, np.ndarray) and
slice_or_indexer.dtype == np.bool_):
return 'mask', slice_or_indexer, slice_or_indexer.sum()
else:
indexer = np.asanyarray(slice_or_indexer, dtype=np.int64)
if not allow_fill:
indexer = maybe_convert_indices(indexer, length)
return 'fancy', indexer, len(indexer)