laywerrobot/lib/python3.6/site-packages/pandas/io/pytables.py

"""
High level interface to PyTables for reading and writing pandas data structures
to disk
"""
# pylint: disable-msg=E1101,W0613,W0603
from datetime import datetime, date
import time
import re
import copy
import itertools
import warnings
import os
from pandas.core.dtypes.common import (
is_list_like,
is_categorical_dtype,
is_timedelta64_dtype,
is_datetime64tz_dtype,
is_datetime64_dtype,
_ensure_object,
_ensure_int64,
_ensure_platform_int)
from pandas.core.dtypes.missing import array_equivalent
import numpy as np
from pandas import (Series, DataFrame, Panel, Index,
MultiIndex, Int64Index, isna, concat, to_datetime,
SparseSeries, SparseDataFrame, PeriodIndex,
DatetimeIndex, TimedeltaIndex)
from pandas.core import config
from pandas.io.common import _stringify_path
from pandas.core.sparse.array import BlockIndex, IntIndex
from pandas.core.base import StringMixin
from pandas.io.formats.printing import adjoin, pprint_thing
from pandas.errors import PerformanceWarning
import pandas.core.common as com
from pandas.core.algorithms import match, unique
from pandas.core.arrays.categorical import (Categorical,
_factorize_from_iterables)
from pandas.core.internals import (BlockManager, make_block,
_block2d_to_blocknd,
_factor_indexer, _block_shape)
from pandas.core.index import _ensure_index
from pandas import compat
from pandas.compat import u_safe as u, PY3, range, lrange, string_types, filter
from pandas.core.config import get_option
from pandas.core.computation.pytables import Expr, maybe_expression
from pandas._libs import algos, lib, writers as libwriters
from pandas._libs.tslibs import timezones
from distutils.version import LooseVersion
# versioning attribute
_version = '0.15.2'
# encoding
# PY3 encoding if we don't specify
_default_encoding = 'UTF-8'
def _ensure_decoded(s):
""" if we have bytes, decode them to unicode """
if isinstance(s, np.bytes_):
s = s.decode('UTF-8')
return s
def _ensure_encoding(encoding):
# set the encoding if we need
if encoding is None:
if PY3:
encoding = _default_encoding
return encoding
def _ensure_str(name):
"""Ensure that an index / column name is a str (python 3) or
unicode (python 2); otherwise they may be np.string dtype.
Non-string dtypes are passed through unchanged.
https://github.com/pandas-dev/pandas/issues/13492
"""
if isinstance(name, compat.string_types):
name = compat.text_type(name)
return name
Term = Expr
def _ensure_term(where, scope_level):
"""
ensure that the where is a Term or a list of Term
this makes sure that we are capturing the scope of variables
that are passed
create the terms here with a frame_level=2 (we are 2 levels down)
"""
# only consider list/tuple here as an ndarray is automatically a coordinate
# list
level = scope_level + 1
if isinstance(where, (list, tuple)):
wlist = []
for w in filter(lambda x: x is not None, where):
if not maybe_expression(w):
wlist.append(w)
else:
wlist.append(Term(w, scope_level=level))
where = wlist
elif maybe_expression(where):
where = Term(where, scope_level=level)
return where
class PossibleDataLossError(Exception):
pass
class ClosedFileError(Exception):
pass
class IncompatibilityWarning(Warning):
pass
incompatibility_doc = """
where criteria is being ignored as this version [%s] is too old (or
not-defined), read the file in and write it out to a new file to upgrade (with
the copy_to method)
"""
class AttributeConflictWarning(Warning):
pass
attribute_conflict_doc = """
the [%s] attribute of the existing index is [%s] which conflicts with the new
[%s], resetting the attribute to None
"""
class DuplicateWarning(Warning):
pass
duplicate_doc = """
duplicate entries in table, taking most recently appended
"""
performance_doc = """
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->%s,key->%s] [items->%s]
"""
# formats
_FORMAT_MAP = {
u('f'): 'fixed',
u('fixed'): 'fixed',
u('t'): 'table',
u('table'): 'table',
}
format_deprecate_doc = """
the table keyword has been deprecated
use the format='fixed(f)|table(t)' keyword instead
fixed(f) : specifies the Fixed format
and is the default for put operations
table(t) : specifies the Table format
and is the default for append operations
"""
# map object types
_TYPE_MAP = {
Series: u('series'),
SparseSeries: u('sparse_series'),
DataFrame: u('frame'),
SparseDataFrame: u('sparse_frame'),
Panel: u('wide'),
}
# storer class map
_STORER_MAP = {
u('Series'): 'LegacySeriesFixed',
u('DataFrame'): 'LegacyFrameFixed',
u('DataMatrix'): 'LegacyFrameFixed',
u('series'): 'SeriesFixed',
u('sparse_series'): 'SparseSeriesFixed',
u('frame'): 'FrameFixed',
u('sparse_frame'): 'SparseFrameFixed',
u('wide'): 'PanelFixed',
}
# table class map
_TABLE_MAP = {
u('generic_table'): 'GenericTable',
u('appendable_series'): 'AppendableSeriesTable',
u('appendable_multiseries'): 'AppendableMultiSeriesTable',
u('appendable_frame'): 'AppendableFrameTable',
u('appendable_multiframe'): 'AppendableMultiFrameTable',
u('appendable_panel'): 'AppendablePanelTable',
u('worm'): 'WORMTable',
u('legacy_frame'): 'LegacyFrameTable',
u('legacy_panel'): 'LegacyPanelTable',
}
# axes map
_AXES_MAP = {
DataFrame: [0],
Panel: [1, 2]
}
# register our configuration options
dropna_doc = """
: boolean
drop ALL nan rows when appending to a table
"""
format_doc = """
: format
default writing format; if None, then
put will default to 'fixed' and append will default to 'table'
"""
with config.config_prefix('io.hdf'):
config.register_option('dropna_table', False, dropna_doc,
validator=config.is_bool)
config.register_option(
'default_format', None, format_doc,
validator=config.is_one_of_factory(['fixed', 'table', None])
)
# oh the troubles to reduce import time
_table_mod = None
_table_file_open_policy_is_strict = False
def _tables():
global _table_mod
global _table_file_open_policy_is_strict
if _table_mod is None:
import tables
_table_mod = tables
# version requirements
if LooseVersion(tables.__version__) < LooseVersion('3.0.0'):
raise ImportError("PyTables version >= 3.0.0 is required")
# set the file open policy
# return the file open policy; this changes as of pytables 3.1
# depending on the HDF5 version
try:
_table_file_open_policy_is_strict = (
tables.file._FILE_OPEN_POLICY == 'strict')
except:
pass
return _table_mod
# interface to/from ###
def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
append=None, **kwargs):
""" store this object, close it if we opened it """
if append:
f = lambda store: store.append(key, value, **kwargs)
else:
f = lambda store: store.put(key, value, **kwargs)
path_or_buf = _stringify_path(path_or_buf)
if isinstance(path_or_buf, string_types):
with HDFStore(path_or_buf, mode=mode, complevel=complevel,
complib=complib) as store:
f(store)
else:
f(path_or_buf)
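# Illustrative usage sketch (assumes pandas imported as pd and a writable,
# hypothetical path 'demo.h5'): DataFrame.to_hdf routes through this function,
# taking the put path by default and the append path when append=True.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({'A': [1, 2, 3]})
#   >>> df.to_hdf('demo.h5', 'df', mode='w', format='table', data_columns=['A'])
#   >>> df.to_hdf('demo.h5', 'df', append=True, format='table')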
def read_hdf(path_or_buf, key=None, mode='r', **kwargs):
"""
Read from the store, close it if we opened it.
Retrieve pandas object stored in file, optionally based on where
criteria
Parameters
----------
path_or_buf : string, buffer or path object
Path to the file to open, or an open :class:`pandas.HDFStore` object.
Supports any object implementing the ``__fspath__`` protocol.
This includes :class:`pathlib.Path` and py._path.local.LocalPath
objects.
.. versionadded:: 0.19.0 support for pathlib, py.path.
.. versionadded:: 0.21.0 support for __fspath__ protocol.
key : object, optional
The group identifier in the store. Can be omitted if the HDF file
contains a single pandas object.
mode : {'r', 'r+', 'a'}, optional
Mode to use when opening the file. Ignored if path_or_buf is a
:class:`pandas.HDFStore`. Default is 'r'.
where : list, optional
A list of Term (or convertible) objects.
start : int, optional
Row number to start selection.
stop : int, optional
Row number to stop selection.
columns : list, optional
A list of columns names to return.
iterator : bool, optional
Return an iterator object.
chunksize : int, optional
Number of rows to include in an iteration when using an iterator.
errors : str, default 'strict'
Specifies how encoding and decoding errors are to be handled.
See the errors argument for :func:`open` for a full list
of options.
**kwargs
Additional keyword arguments passed to HDFStore.
Returns
-------
item : object
The selected object. Return type depends on the object stored.
See Also
--------
pandas.DataFrame.to_hdf : write a HDF file from a DataFrame
pandas.HDFStore : low-level access to HDF files
Examples
--------
>>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])
>>> df.to_hdf('./store.h5', 'data')
>>> reread = pd.read_hdf('./store.h5')
"""
if mode not in ['r', 'r+', 'a']:
raise ValueError('mode {0} is not allowed while performing a read. '
'Allowed modes are r, r+ and a.'.format(mode))
# grab the scope
if 'where' in kwargs:
kwargs['where'] = _ensure_term(kwargs['where'], scope_level=1)
if isinstance(path_or_buf, HDFStore):
if not path_or_buf.is_open:
raise IOError('The HDFStore must be open for reading.')
store = path_or_buf
auto_close = False
else:
path_or_buf = _stringify_path(path_or_buf)
if not isinstance(path_or_buf, string_types):
raise NotImplementedError('Support for generic buffers has not '
'been implemented.')
try:
exists = os.path.exists(path_or_buf)
# if filepath is too long
except (TypeError, ValueError):
exists = False
if not exists:
raise compat.FileNotFoundError(
'File %s does not exist' % path_or_buf)
store = HDFStore(path_or_buf, mode=mode, **kwargs)
# can't auto open/close if we are using an iterator
# so delegate to the iterator
auto_close = True
try:
if key is None:
groups = store.groups()
if len(groups) == 0:
raise ValueError('No dataset in HDF5 file.')
candidate_only_group = groups[0]
# For the HDF file to have only one dataset, all other groups
# should then be metadata groups for that candidate group. (This
# assumes that the groups() method enumerates parent groups
# before their children.)
for group_to_check in groups[1:]:
if not _is_metadata_of(group_to_check, candidate_only_group):
raise ValueError('key must be provided when HDF5 file '
'contains multiple datasets.')
key = candidate_only_group._v_pathname
return store.select(key, auto_close=auto_close, **kwargs)
except:
# if there is an error, close the store
try:
store.close()
except:
pass
raise
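# Illustrative usage sketch (hypothetical file/key from the example above):
# for table-format nodes, read_hdf can push 'where' and 'columns' criteria
# down to PyTables instead of loading the whole object.
#
#   >>> pd.read_hdf('demo.h5', 'df', where='A > 1', columns=['A'])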
def _is_metadata_of(group, parent_group):
"""Check if a given group is a metadata group for a given parent_group."""
if group._v_depth <= parent_group._v_depth:
return False
current = group
while current._v_depth > 1:
parent = current._v_parent
if parent == parent_group and current._v_name == 'meta':
return True
current = current._v_parent
return False
class HDFStore(StringMixin):
"""
dict-like IO interface for storing pandas objects in PyTables
either Fixed or Table format.
Parameters
----------
path : string
File path to HDF5 file
mode : {'a', 'w', 'r', 'r+'}, default 'a'
``'r'``
Read-only; no data can be modified.
``'w'``
Write; a new file is created (an existing file with the same
name would be deleted).
``'a'``
Append; an existing file is opened for reading and writing,
and if the file does not exist it is created.
``'r+'``
It is similar to ``'a'``, but the file must already exist.
complevel : int, 0-9, default None
Specifies a compression level for data.
A value of 0 disables compression.
complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
Specifies the compression library to be used.
As of v0.20.2 these additional compressors for Blosc are supported
(default if no compressor specified: 'blosc:blosclz'):
{'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
'blosc:zlib', 'blosc:zstd'}.
Specifying a compression library which is not available issues
a ValueError.
fletcher32 : bool, default False
If applying compression use the fletcher32 checksum
Examples
--------
>>> from pandas import DataFrame
>>> from numpy.random import randn
>>> bar = DataFrame(randn(10, 4))
>>> store = HDFStore('test.h5')
>>> store['foo'] = bar # write to HDF5
>>> bar = store['foo'] # retrieve
>>> store.close()
"""
def __init__(self, path, mode=None, complevel=None, complib=None,
fletcher32=False, **kwargs):
try:
import tables # noqa
except ImportError as ex: # pragma: no cover
raise ImportError('HDFStore requires PyTables, "{ex}" problem '
'importing'.format(ex=str(ex)))
if complib is not None and complib not in tables.filters.all_complibs:
raise ValueError(
"complib only supports {libs} compression.".format(
libs=tables.filters.all_complibs))
if complib is None and complevel is not None:
complib = tables.filters.default_complib
self._path = _stringify_path(path)
if mode is None:
mode = 'a'
self._mode = mode
self._handle = None
self._complevel = complevel if complevel else 0
self._complib = complib
self._fletcher32 = fletcher32
self._filters = None
self.open(mode=mode, **kwargs)
def __fspath__(self):
return self._path
@property
def root(self):
""" return the root node """
self._check_if_open()
return self._handle.root
@property
def filename(self):
return self._path
def __getitem__(self, key):
return self.get(key)
def __setitem__(self, key, value):
self.put(key, value)
def __delitem__(self, key):
return self.remove(key)
def __getattr__(self, name):
""" allow attribute access to get stores """
try:
return self.get(name)
except:
pass
raise AttributeError("'%s' object has no attribute '%s'" %
(type(self).__name__, name))
def __contains__(self, key):
""" check for existence of this key
can match the exact pathname or the pathname w/o the leading '/'
"""
node = self.get_node(key)
if node is not None:
name = node._v_pathname
if name == key or name[1:] == key:
return True
return False
def __len__(self):
return len(self.groups())
def __unicode__(self):
return '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path))
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.close()
def keys(self):
"""
Return a (potentially unordered) list of the keys corresponding to the
objects stored in the HDFStore. These are ABSOLUTE path-names (e.g.
have the leading '/')
"""
return [n._v_pathname for n in self.groups()]
def __iter__(self):
return iter(self.keys())
def items(self):
"""
iterate on key->group
"""
for g in self.groups():
yield g._v_pathname, g
iteritems = items
def open(self, mode='a', **kwargs):
"""
Open the file in the specified mode
Parameters
----------
mode : {'a', 'w', 'r', 'r+'}, default 'a'
See HDFStore docstring or tables.open_file for info about modes
"""
tables = _tables()
if self._mode != mode:
# if we are changing a write mode to read, ok
if self._mode in ['a', 'w'] and mode in ['r', 'r+']:
pass
elif mode in ['w']:
# this would truncate, raise here
if self.is_open:
raise PossibleDataLossError(
"Re-opening the file [{0}] with mode [{1}] "
"will delete the current file!"
.format(self._path, self._mode)
)
self._mode = mode
# close and reopen the handle
if self.is_open:
self.close()
if self._complevel and self._complevel > 0:
self._filters = _tables().Filters(self._complevel, self._complib,
fletcher32=self._fletcher32)
try:
self._handle = tables.open_file(self._path, self._mode, **kwargs)
except (IOError) as e: # pragma: no cover
if 'can not be written' in str(e):
print('Opening %s in read-only mode' % self._path)
self._handle = tables.open_file(self._path, 'r', **kwargs)
else:
raise
except (ValueError) as e:
# trap PyTables >= 3.1 FILE_OPEN_POLICY exception
# to provide an updated message
if 'FILE_OPEN_POLICY' in str(e):
e = ValueError(
"PyTables [{version}] no longer supports opening multiple "
"files\n"
"even in read-only mode on this HDF5 version "
"[{hdf_version}]. You can accept this\n"
"and not open the same file multiple times at once,\n"
"upgrade the HDF5 version, or downgrade to PyTables 3.0.0 "
"which allows\n"
"files to be opened multiple times at once\n"
.format(version=tables.__version__,
hdf_version=tables.get_hdf5_version()))
raise e
except (Exception) as e:
# trying to read from a non-existent file causes an error which
# is not part of IOError, make it one
if self._mode == 'r' and 'Unable to open/create file' in str(e):
raise IOError(str(e))
raise
def close(self):
"""
Close the PyTables file handle
"""
if self._handle is not None:
self._handle.close()
self._handle = None
@property
def is_open(self):
"""
return a boolean indicating whether the file is open
"""
if self._handle is None:
return False
return bool(self._handle.isopen)
def flush(self, fsync=False):
"""
Force all buffered modifications to be written to disk.
Parameters
----------
fsync : bool (default False)
call ``os.fsync()`` on the file handle to force writing to disk.
Notes
-----
Without ``fsync=True``, flushing may not guarantee that the OS writes
to disk. With fsync, the operation will block until the OS claims the
file has been written; however, other caching layers may still
interfere.
"""
if self._handle is not None:
self._handle.flush()
if fsync:
try:
os.fsync(self._handle.fileno())
except:
pass
def get(self, key):
"""
Retrieve pandas object stored in file
Parameters
----------
key : object
Returns
-------
obj : type of object stored in file
"""
group = self.get_node(key)
if group is None:
raise KeyError('No object named %s in the file' % key)
return self._read_group(group)
def select(self, key, where=None, start=None, stop=None, columns=None,
iterator=False, chunksize=None, auto_close=False, **kwargs):
"""
Retrieve pandas object stored in file, optionally based on where
criteria
Parameters
----------
key : object
where : list of Term (or convertible) objects, optional
start : integer (defaults to None), row number to start selection
stop : integer (defaults to None), row number to stop selection
columns : a list of columns that if not None, will limit the return
columns
iterator : boolean, return an iterator, default False
chunksize : nrows to include in iteration, return an iterator
auto_close : boolean, should automatically close the store when
finished, default is False
Returns
-------
The selected object
"""
group = self.get_node(key)
if group is None:
raise KeyError('No object named %s in the file' % key)
# create the storer and axes
where = _ensure_term(where, scope_level=1)
s = self._create_storer(group)
s.infer_axes()
# function to call on iteration
def func(_start, _stop, _where):
return s.read(start=_start, stop=_stop,
where=_where,
columns=columns)
# create the iterator
it = TableIterator(self, s, func, where=where, nrows=s.nrows,
start=start, stop=stop, iterator=iterator,
chunksize=chunksize, auto_close=auto_close)
return it.get_result()
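# Illustrative usage sketch (hypothetical store/key, with 'A' stored as a data
# column): 'where' accepts Term-convertible strings, and start/stop bound the
# row range the query is evaluated over.
#
#   >>> with pd.HDFStore('demo.h5') as store:
#   ...     subset = store.select('df', where='A > 1', start=0, stop=100)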
def select_as_coordinates(
self, key, where=None, start=None, stop=None, **kwargs):
"""
return the selection as an Index
Parameters
----------
key : object
where : list of Term (or convertible) objects, optional
start : integer (defaults to None), row number to start selection
stop : integer (defaults to None), row number to stop selection
"""
where = _ensure_term(where, scope_level=1)
return self.get_storer(key).read_coordinates(where=where, start=start,
stop=stop, **kwargs)
def select_column(self, key, column, **kwargs):
"""
return a single column from the table. This is generally only useful to
select an indexable
Parameters
----------
key : object
column: the column of interest
Exceptions
----------
raises KeyError if the column is not found (or key is not a valid
store)
raises ValueError if the column can not be extracted individually (it
is part of a data block)
"""
return self.get_storer(key).read_column(column=column, **kwargs)
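# Illustrative usage sketch (hypothetical store/key): only indexables and data
# columns can be read in isolation; the stored index is itself a column.
#
#   >>> with pd.HDFStore('demo.h5') as store:
#   ...     a_values = store.select_column('df', 'A')   # Series of 'A' values
#   ...     idx = store.select_column('df', 'index')    # the stored index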
def select_as_multiple(self, keys, where=None, selector=None, columns=None,
start=None, stop=None, iterator=False,
chunksize=None, auto_close=False, **kwargs):
""" Retrieve pandas objects from multiple tables
Parameters
----------
keys : a list of the tables
selector : the table to apply the where criteria (defaults to keys[0]
if not supplied)
columns : the columns I want back
start : integer (defaults to None), row number to start selection
stop : integer (defaults to None), row number to stop selection
iterator : boolean, return an iterator, default False
chunksize : nrows to include in iteration, return an iterator
Exceptions
----------
raises KeyError if keys or selector is not found or keys is empty
raises TypeError if keys is not a list or tuple
raises ValueError if the tables are not ALL THE SAME DIMENSIONS
"""
# default to single select
where = _ensure_term(where, scope_level=1)
if isinstance(keys, (list, tuple)) and len(keys) == 1:
keys = keys[0]
if isinstance(keys, string_types):
return self.select(key=keys, where=where, columns=columns,
start=start, stop=stop, iterator=iterator,
chunksize=chunksize, **kwargs)
if not isinstance(keys, (list, tuple)):
raise TypeError("keys must be a list/tuple")
if not len(keys):
raise ValueError("keys must have a non-zero length")
if selector is None:
selector = keys[0]
# collect the tables
tbls = [self.get_storer(k) for k in keys]
s = self.get_storer(selector)
# validate rows
nrows = None
for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
if t is None:
raise KeyError("Invalid table [%s]" % k)
if not t.is_table:
raise TypeError(
"object [%s] is not a table, and cannot be used in all "
"select as multiple" % t.pathname
)
if nrows is None:
nrows = t.nrows
elif t.nrows != nrows:
raise ValueError(
"all tables must have exactly the same nrows!")
# axis is the concatenation axis
axis = list({t.non_index_axes[0][0] for t in tbls})[0]
def func(_start, _stop, _where):
# retrieve the objs, _where is always passed as a set of
# coordinates here
objs = [t.read(where=_where, columns=columns, start=_start,
stop=_stop, **kwargs) for t in tbls]
# concat and return
return concat(objs, axis=axis,
verify_integrity=False)._consolidate()
# create the iterator
it = TableIterator(self, s, func, where=where, nrows=nrows,
start=start, stop=stop, iterator=iterator,
chunksize=chunksize, auto_close=auto_close)
return it.get_result(coordinates=True)
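# Illustrative usage sketch (hypothetical keys 'df1'/'df2', written so that
# both tables have identical rows): the 'where' is evaluated against the
# selector table only, and the resulting coordinates are used to read rows
# from every listed table before concatenating.
#
#   >>> with pd.HDFStore('demo.h5') as store:
#   ...     combined = store.select_as_multiple(
#   ...         ['df1', 'df2'], where='A > 1', selector='df1')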
def put(self, key, value, format=None, append=False, **kwargs):
"""
Store object in HDFStore
Parameters
----------
key : object
value : {Series, DataFrame, Panel}
format : 'fixed(f)|table(t)', default is 'fixed'
fixed(f) : Fixed format
Fast writing/reading. Not-appendable, nor searchable
table(t) : Table format
Write as a PyTables Table structure which may perform
worse but allow more flexible operations like searching
/ selecting subsets of the data
append : boolean, default False
This will force Table format, append the input data to the
existing.
data_columns : list of columns to create as data columns, or True to
use all columns. See
`here <http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__ # noqa
encoding : default None, provide an encoding for strings
dropna : boolean, default False, do not write an ALL nan row to
the store settable by the option 'io.hdf.dropna_table'
"""
if format is None:
format = get_option("io.hdf.default_format") or 'fixed'
kwargs = self._validate_format(format, kwargs)
self._write_to_group(key, value, append=append, **kwargs)
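# Illustrative usage sketch (hypothetical store/keys): put defaults to the
# fixed format; passing format='table' (or setting 'io.hdf.default_format')
# produces a node that can later be queried with where criteria.
#
#   >>> with pd.HDFStore('demo.h5') as store:
#   ...     store.put('fixed_df', df)                             # not queryable
#   ...     store.put('table_df', df, format='table', data_columns=['A'])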
def remove(self, key, where=None, start=None, stop=None):
"""
Remove pandas object partially by specifying the where condition
Parameters
----------
key : string
Node to remove or delete rows from
where : list of Term (or convertible) objects, optional
start : integer (defaults to None), row number to start selection
stop : integer (defaults to None), row number to stop selection
Returns
-------
number of rows removed (or None if not a Table)
Exceptions
----------
raises KeyError if key is not a valid store
"""
where = _ensure_term(where, scope_level=1)
try:
s = self.get_storer(key)
except KeyError:
# the key is not a valid store, re-raising KeyError
raise
except Exception:
if where is not None:
raise ValueError(
"trying to remove a node with a non-None where clause!")
# we are actually trying to remove a node (with children)
s = self.get_node(key)
if s is not None:
s._f_remove(recursive=True)
return None
# remove the node
if com._all_none(where, start, stop):
s.group._f_remove(recursive=True)
# delete from the table
else:
if not s.is_table:
raise ValueError(
'can only remove with where on objects written as tables')
return s.delete(where=where, start=start, stop=stop)
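# Illustrative usage sketch (hypothetical keys from the put example above):
# with a 'where' clause only table-format nodes can be partially deleted, and
# the number of removed rows is returned; without one the whole node is
# dropped.
#
#   >>> with pd.HDFStore('demo.h5') as store:
#   ...     n_removed = store.remove('table_df', where='A > 1')
#   ...     store.remove('fixed_df')   # removes the entire node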
def append(self, key, value, format=None, append=True, columns=None,
dropna=None, **kwargs):
"""
Append to Table in file. Node must already exist and be Table
format.
Parameters
----------
key : object
value : {Series, DataFrame, Panel}
format: 'table' is the default
table(t) : table format
Write as a PyTables Table structure which may perform
worse but allow more flexible operations like searching
/ selecting subsets of the data
append : boolean, default True, append the input data to the
existing
data_columns : list of columns, or True, default None
List of columns to create as indexed data columns for on-disk
queries, or True to use all columns. By default only the axes
of the object are indexed. See `here
<http://pandas.pydata.org/pandas-docs/stable/io.html#query-via-data-columns>`__.
min_itemsize : dict of columns that specify minimum string sizes
nan_rep : string to use as string nan representation
chunksize : size to chunk the writing
expectedrows : expected TOTAL row size of this table
encoding : default None, provide an encoding for strings
dropna : boolean, default False, do not write an ALL nan row to
the store settable by the option 'io.hdf.dropna_table'
Notes
-----
Does *not* check if data being appended overlaps with existing
data in the table, so be careful
"""
if columns is not None:
raise TypeError("columns is not a supported keyword in append, "
"try data_columns")
if dropna is None:
dropna = get_option("io.hdf.dropna_table")
if format is None:
format = get_option("io.hdf.default_format") or 'table'
kwargs = self._validate_format(format, kwargs)
self._write_to_group(key, value, append=append, dropna=dropna,
**kwargs)
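# Illustrative usage sketch (hypothetical key/columns): append always uses the
# table format; min_itemsize reserves string widths up front so later, longer
# strings still fit, and data_columns makes 'A' queryable on disk.
#
#   >>> with pd.HDFStore('demo.h5') as store:
#   ...     store.append('log', df, data_columns=['A'],
#   ...                  min_itemsize={'values': 50})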
def append_to_multiple(self, d, value, selector, data_columns=None,
axes=None, dropna=False, **kwargs):
"""
Append to multiple tables
Parameters
----------
d : a dict of table_name to table_columns, None is acceptable as the
values of one node (this will get all the remaining columns)
value : a pandas object
selector : a string that designates the indexable table; all of its
columns will be designated as data_columns, unless data_columns is
passed, in which case these are used
data_columns : list of columns to create as data columns, or True to
use all columns
dropna : if evaluates to True, drop rows from all tables if any single
row in each table has all NaN. Default False.
Notes
-----
axes parameter is currently not accepted
"""
if axes is not None:
raise TypeError("axes is currently not accepted as a parameter to"
" append_to_multiple; you can create the "
"tables independently instead")
if not isinstance(d, dict):
raise ValueError(
"append_to_multiple must have a dictionary specified as the "
"way to split the value"
)
if selector not in d:
raise ValueError(
"append_to_multiple requires a selector that is in passed dict"
)
# figure out the splitting axis (the non_index_axis)
axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]
# figure out how to split the value
remain_key = None
remain_values = []
for k, v in d.items():
if v is None:
if remain_key is not None:
raise ValueError(
"append_to_multiple can only have one value in d that "
"is None"
)
remain_key = k
else:
remain_values.extend(v)
if remain_key is not None:
ordered = value.axes[axis]
ordd = ordered.difference(Index(remain_values))
ordd = sorted(ordered.get_indexer(ordd))
d[remain_key] = ordered.take(ordd)
# data_columns
if data_columns is None:
data_columns = d[selector]
# ensure rows are synchronized across the tables
if dropna:
idxs = (value[cols].dropna(how='all').index for cols in d.values())
valid_index = next(idxs)
for index in idxs:
valid_index = valid_index.intersection(index)
value = value.loc[valid_index]
# append
for k, v in d.items():
dc = data_columns if k == selector else None
# compute the val
val = value.reindex(v, axis=axis)
self.append(k, val, data_columns=dc, **kwargs)
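# Illustrative usage sketch (hypothetical keys/columns): the dict entry whose
# value is None receives all columns not claimed by the other tables, and the
# selector table's columns become the data columns unless data_columns is
# passed explicitly.
#
#   >>> wide = pd.DataFrame({'A': [1, 2], 'B': [3, 4], 'C': [5, 6]})
#   >>> with pd.HDFStore('demo.h5') as store:
#   ...     store.append_to_multiple(
#   ...         {'t1': ['A', 'B'], 't2': None}, wide, selector='t1')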
def create_table_index(self, key, **kwargs):
""" Create a pytables index on the table
Parameters
----------
key : object (the node to index)
Exceptions
----------
raises if the node is not a table
"""
# version requirements
_tables()
s = self.get_storer(key)
if s is None:
return
if not s.is_table:
raise TypeError(
"cannot create table index on a Fixed format store")
s.create_index(**kwargs)
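# Illustrative usage sketch (hypothetical key, with 'A' stored as a data
# column): kwargs are forwarded to the storer's create_index, so specific
# columns and PyTables index options can be passed through.
#
#   >>> with pd.HDFStore('demo.h5') as store:
#   ...     store.create_table_index('log', columns=['A'],
#   ...                              optlevel=9, kind='full')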
def groups(self):
"""return a list of all the top-level nodes (that are not themselves a
pandas storage object)
"""
_tables()
self._check_if_open()
return [
g for g in self._handle.walk_nodes()
if (not isinstance(g, _table_mod.link.Link) and
(getattr(g._v_attrs, 'pandas_type', None) or
getattr(g, 'table', None) or
(isinstance(g, _table_mod.table.Table) and
g._v_name != u('table'))))
]
def get_node(self, key):
""" return the node with the key or None if it does not exist """
self._check_if_open()
try:
if not key.startswith('/'):
key = '/' + key
return self._handle.get_node(self.root, key)
except:
return None
def get_storer(self, key):
""" return the storer object for a key, raise if not in the file """
group = self.get_node(key)
if group is None:
raise KeyError('No object named {} in the file'.format(key))
s = self._create_storer(group)
s.infer_axes()
return s
def copy(self, file, mode='w', propindexes=True, keys=None, complib=None,
complevel=None, fletcher32=False, overwrite=True):
""" copy the existing store to a new file, upgrading in place
Parameters
----------
propindexes: restore indexes in copied file (defaults to True)
keys : list of keys to include in the copy (defaults to all)
overwrite : overwrite (remove and replace) existing nodes in the
new store (default is True)
mode, complib, complevel, fletcher32 same as in HDFStore.__init__
Returns
-------
open file handle of the new store
"""
new_store = HDFStore(
file,
mode=mode,
complib=complib,
complevel=complevel,
fletcher32=fletcher32)
if keys is None:
keys = list(self.keys())
if not isinstance(keys, (tuple, list)):
keys = [keys]
for k in keys:
s = self.get_storer(k)
if s is not None:
if k in new_store:
if overwrite:
new_store.remove(k)
data = self.select(k)
if s.is_table:
index = False
if propindexes:
index = [a.name for a in s.axes if a.is_indexed]
new_store.append(
k, data, index=index,
data_columns=getattr(s, 'data_columns', None),
encoding=s.encoding
)
else:
new_store.put(k, data, encoding=s.encoding)
return new_store
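# Illustrative usage sketch (hypothetical file names): copy rewrites the store
# into a new file, optionally recreating table indexes, and returns the new
# store still open.
#
#   >>> with pd.HDFStore('demo.h5') as store:
#   ...     new_store = store.copy('demo_copy.h5', mode='w', propindexes=True)
#   ...     new_store.close()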
def info(self):
"""
print detailed information on the store
.. versionadded:: 0.21.0
"""
output = '%s\nFile path: %s\n' % (type(self), pprint_thing(self._path))
if self.is_open:
lkeys = sorted(list(self.keys()))
if len(lkeys):
keys = []
values = []
for k in lkeys:
try:
s = self.get_storer(k)
if s is not None:
keys.append(pprint_thing(s.pathname or k))
values.append(
pprint_thing(s or 'invalid_HDFStore node'))
except Exception as detail:
keys.append(k)
values.append("[invalid_HDFStore node: %s]"
% pprint_thing(detail))
output += adjoin(12, keys, values)
else:
output += 'Empty'
else:
output += "File is CLOSED"
return output
# private methods ######
def _check_if_open(self):
if not self.is_open:
raise ClosedFileError("{0} file is not open!".format(self._path))
def _validate_format(self, format, kwargs):
""" validate / deprecate formats; return the new kwargs """
kwargs = kwargs.copy()
# validate
try:
kwargs['format'] = _FORMAT_MAP[format.lower()]
except:
raise TypeError("invalid HDFStore format specified [{0}]"
.format(format))
return kwargs
def _create_storer(self, group, format=None, value=None, append=False,
**kwargs):
""" return a suitable class to operate """
def error(t):
raise TypeError(
"cannot properly create the storer for: [%s] [group->%s,"
"value->%s,format->%s,append->%s,kwargs->%s]"
% (t, group, type(value), format, append, kwargs)
)
pt = _ensure_decoded(getattr(group._v_attrs, 'pandas_type', None))
tt = _ensure_decoded(getattr(group._v_attrs, 'table_type', None))
# infer the pt from the passed value
if pt is None:
if value is None:
_tables()
if (getattr(group, 'table', None) or
isinstance(group, _table_mod.table.Table)):
pt = u('frame_table')
tt = u('generic_table')
else:
raise TypeError(
"cannot create a storer if the object is not existing "
"nor a value are passed")
else:
try:
pt = _TYPE_MAP[type(value)]
except:
error('_TYPE_MAP')
# we are actually a table
if format == 'table':
pt += u('_table')
# a storer node
if u('table') not in pt:
try:
return globals()[_STORER_MAP[pt]](self, group, **kwargs)
except:
error('_STORER_MAP')
# existing node (and must be a table)
if tt is None:
# if we are a writer, determine the tt
if value is not None:
if pt == u('series_table'):
index = getattr(value, 'index', None)
if index is not None:
if index.nlevels == 1:
tt = u('appendable_series')
elif index.nlevels > 1:
tt = u('appendable_multiseries')
elif pt == u('frame_table'):
index = getattr(value, 'index', None)
if index is not None:
if index.nlevels == 1:
tt = u('appendable_frame')
elif index.nlevels > 1:
tt = u('appendable_multiframe')
elif pt == u('wide_table'):
tt = u('appendable_panel')
elif pt == u('ndim_table'):
tt = u('appendable_ndim')
else:
# distinguish between a frame/table
tt = u('legacy_panel')
try:
fields = group.table._v_attrs.fields
if len(fields) == 1 and fields[0] == u('value'):
tt = u('legacy_frame')
except:
pass
try:
return globals()[_TABLE_MAP[tt]](self, group, **kwargs)
except:
error('_TABLE_MAP')
def _write_to_group(self, key, value, format, index=True, append=False,
complib=None, encoding=None, **kwargs):
group = self.get_node(key)
# remove the node if we are not appending
if group is not None and not append:
self._handle.remove_node(group, recursive=True)
group = None
# we don't want to store a table node at all if our object is 0-len
# as there are no dtypes
if getattr(value, 'empty', None) and (format == 'table' or append):
return
if group is None:
paths = key.split('/')
# recursively create the groups
path = '/'
for p in paths:
if not len(p):
continue
new_path = path
if not path.endswith('/'):
new_path += '/'
new_path += p
group = self.get_node(new_path)
if group is None:
group = self._handle.create_group(path, p)
path = new_path
s = self._create_storer(group, format, value, append=append,
encoding=encoding, **kwargs)
if append:
# raise if we are trying to append to a Fixed format,
# or a table that exists (and we are putting)
if (not s.is_table or
(s.is_table and format == 'fixed' and s.is_exists)):
raise ValueError('Can only append to Tables')
if not s.is_exists:
s.set_object_info()
else:
s.set_object_info()
if not s.is_table and complib:
raise ValueError(
'Compression not supported on Fixed format stores'
)
# write the object
s.write(obj=value, append=append, complib=complib, **kwargs)
if s.is_table and index:
s.create_index(columns=index)
def _read_group(self, group, **kwargs):
s = self._create_storer(group)
s.infer_axes()
return s.read(**kwargs)
def get_store(path, **kwargs):
""" Backwards compatible alias for ``HDFStore``
"""
warnings.warn(
"get_store is deprecated and be "
"removed in a future version\n"
"HDFStore(path, **kwargs) is the replacement",
FutureWarning,
stacklevel=6)
return HDFStore(path, **kwargs)
class TableIterator(object):
""" define the iteration interface on a table
Parameters
----------
store : the reference store
s : the referred storer
func : the function to execute the query
where : the where of the query
nrows : the rows to iterate on
start : the passed start value (default is None)
stop : the passed stop value (default is None)
iterator : boolean, whether to use the default iterator
chunksize : the passed chunking value (default is 100000)
auto_close : boolean, automatically close the store at the end of
iteration, default is False
kwargs : the passed kwargs
"""
def __init__(self, store, s, func, where, nrows, start=None, stop=None,
iterator=False, chunksize=None, auto_close=False):
self.store = store
self.s = s
self.func = func
self.where = where
# set start/stop if they are not set if we are a table
if self.s.is_table:
if nrows is None:
nrows = 0
if start is None:
start = 0
if stop is None:
stop = nrows
stop = min(nrows, stop)
self.nrows = nrows
self.start = start
self.stop = stop
self.coordinates = None
if iterator or chunksize is not None:
if chunksize is None:
chunksize = 100000
self.chunksize = int(chunksize)
else:
self.chunksize = None
self.auto_close = auto_close
def __iter__(self):
# iterate
current = self.start
while current < self.stop:
stop = min(current + self.chunksize, self.stop)
value = self.func(None, None, self.coordinates[current:stop])
current = stop
if value is None or not len(value):
continue
yield value
self.close()
def close(self):
if self.auto_close:
self.store.close()
def get_result(self, coordinates=False):
# return the actual iterator
if self.chunksize is not None:
if not self.s.is_table:
raise TypeError(
"can only use an iterator or chunksize on a table")
self.coordinates = self.s.read_coordinates(where=self.where)
return self
# if specified read via coordinates (necessary for multiple selections)
if coordinates:
where = self.s.read_coordinates(where=self.where, start=self.start,
stop=self.stop)
else:
where = self.where
# directly return the result
results = self.func(self.start, self.stop, where)
self.close()
return results
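# Illustrative usage sketch (hypothetical table-format key): TableIterator
# backs the iterator/chunksize modes of select; users normally reach it by
# requesting a chunked read rather than constructing it directly.
#
#   >>> with pd.HDFStore('demo.h5') as store:
#   ...     for chunk in store.select('log', chunksize=100000):
#   ...         pass  # process each chunk, a DataFrame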
class IndexCol(StringMixin):
""" an index column description class
Parameters
----------
axis : axis which I reference
values : the ndarray like converted values
kind : a string description of this type
typ : the pytables type
pos : the position in the pytables
"""
is_an_indexable = True
is_data_indexable = True
_info_fields = ['freq', 'tz', 'index_name']
def __init__(self, values=None, kind=None, typ=None, cname=None,
itemsize=None, name=None, axis=None, kind_attr=None,
pos=None, freq=None, tz=None, index_name=None, **kwargs):
self.values = values
self.kind = kind
self.typ = typ
self.itemsize = itemsize
self.name = name
self.cname = cname
self.kind_attr = kind_attr
self.axis = axis
self.pos = pos
self.freq = freq
self.tz = tz
self.index_name = index_name
self.table = None
self.meta = None
self.metadata = None
if name is not None:
self.set_name(name, kind_attr)
if pos is not None:
self.set_pos(pos)
def set_name(self, name, kind_attr=None):
""" set the name of this indexer """
self.name = name
self.kind_attr = kind_attr or "%s_kind" % name
if self.cname is None:
self.cname = name
return self
def set_axis(self, axis):
""" set the axis over which I index """
self.axis = axis
return self
def set_pos(self, pos):
""" set the position of this column in the Table """
self.pos = pos
if pos is not None and self.typ is not None:
self.typ._v_pos = pos
return self
def set_table(self, table):
self.table = table
return self
def __unicode__(self):
temp = tuple(
map(pprint_thing,
(self.name,
self.cname,
self.axis,
self.pos,
self.kind)))
return "name->%s,cname->%s,axis->%s,pos->%s,kind->%s" % temp
def __eq__(self, other):
""" compare 2 col items """
return all(getattr(self, a, None) == getattr(other, a, None)
for a in ['name', 'cname', 'axis', 'pos'])
def __ne__(self, other):
return not self.__eq__(other)
@property
def is_indexed(self):
""" return whether I am an indexed column """
try:
return getattr(self.table.cols, self.cname).is_indexed
except:
return False
def copy(self):
new_self = copy.copy(self)
return new_self
def infer(self, handler):
"""infer this column from the table: create and return a new object"""
table = handler.table
new_self = self.copy()
new_self.set_table(table)
new_self.get_attr()
new_self.read_metadata(handler)
return new_self
def convert(self, values, nan_rep, encoding, errors):
""" set the values from this selection: take = take ownership """
# values is a recarray
if values.dtype.fields is not None:
values = values[self.cname]
values = _maybe_convert(values, self.kind, encoding, errors)
kwargs = dict()
if self.freq is not None:
kwargs['freq'] = _ensure_decoded(self.freq)
if self.index_name is not None:
kwargs['name'] = _ensure_decoded(self.index_name)
try:
self.values = Index(values, **kwargs)
except:
# if the output freq is different than what we recorded,
# it should be None (see also 'doc example part 2')
if 'freq' in kwargs:
kwargs['freq'] = None
self.values = Index(values, **kwargs)
self.values = _set_tz(self.values, self.tz)
return self
def take_data(self):
""" return the values & release the memory """
self.values, values = None, self.values
return values
@property
def attrs(self):
return self.table._v_attrs
@property
def description(self):
return self.table.description
@property
def col(self):
""" return my current col description """
return getattr(self.description, self.cname, None)
@property
def cvalues(self):
""" return my cython values """
return self.values
def __iter__(self):
return iter(self.values)
def maybe_set_size(self, min_itemsize=None, **kwargs):
""" maybe set a string col itemsize:
min_itemsize can be an integer or a dict with this column's name
with an integer size """
if _ensure_decoded(self.kind) == u('string'):
if isinstance(min_itemsize, dict):
min_itemsize = min_itemsize.get(self.name)
if min_itemsize is not None and self.typ.itemsize < min_itemsize:
self.typ = _tables(
).StringCol(itemsize=min_itemsize, pos=self.pos)
def validate(self, handler, append, **kwargs):
self.validate_names()
def validate_names(self):
pass
def validate_and_set(self, handler, append, **kwargs):
self.set_table(handler.table)
self.validate_col()
self.validate_attr(append)
self.validate_metadata(handler)
self.write_metadata(handler)
self.set_attr()
def validate_col(self, itemsize=None):
""" validate this column: return the compared against itemsize """
# validate this column for string truncation (or reset to the max size)
if _ensure_decoded(self.kind) == u('string'):
c = self.col
if c is not None:
if itemsize is None:
itemsize = self.itemsize
if c.itemsize < itemsize:
raise ValueError(
"Trying to store a string with len [%s] in [%s] "
"column but\nthis column has a limit of [%s]!\n"
"Consider using min_itemsize to preset the sizes on "
"these columns" % (itemsize, self.cname, c.itemsize))
return c.itemsize
return None
def validate_attr(self, append):
# check for backwards incompatibility
if append:
existing_kind = getattr(self.attrs, self.kind_attr, None)
if existing_kind is not None and existing_kind != self.kind:
raise TypeError("incompatible kind in col [%s - %s]" %
(existing_kind, self.kind))
def update_info(self, info):
""" set/update the info for this indexable with the key/value
if there is a conflict raise/warn as needed """
for key in self._info_fields:
value = getattr(self, key, None)
idx = _get_info(info, self.name)
existing_value = idx.get(key)
if key in idx and value is not None and existing_value != value:
# frequency/name just warn
if key in ['freq', 'index_name']:
ws = attribute_conflict_doc % (key, existing_value, value)
warnings.warn(ws, AttributeConflictWarning, stacklevel=6)
# reset
idx[key] = None
setattr(self, key, None)
else:
raise ValueError(
"invalid info for [%s] for [%s], existing_value [%s] "
"conflicts with new value [%s]"
% (self.name, key, existing_value, value))
else:
if value is not None or existing_value is not None:
idx[key] = value
return self
def set_info(self, info):
""" set my state from the passed info """
idx = info.get(self.name)
if idx is not None:
self.__dict__.update(idx)
def get_attr(self):
""" set the kind for this column """
self.kind = getattr(self.attrs, self.kind_attr, None)
def set_attr(self):
""" set the kind for this column """
setattr(self.attrs, self.kind_attr, self.kind)
def read_metadata(self, handler):
""" retrieve the metadata for this columns """
self.metadata = handler.read_metadata(self.cname)
def validate_metadata(self, handler):
""" validate that kind=category does not change the categories """
if self.meta == 'category':
new_metadata = self.metadata
cur_metadata = handler.read_metadata(self.cname)
if new_metadata is not None and cur_metadata is not None \
and not array_equivalent(new_metadata, cur_metadata):
raise ValueError("cannot append a categorical with "
"different categories to the existing")
def write_metadata(self, handler):
""" set the meta data """
if self.metadata is not None:
handler.write_metadata(self.cname, self.metadata)
class GenericIndexCol(IndexCol):
""" an index which is not represented in the data of the table """
@property
def is_indexed(self):
return False
def convert(self, values, nan_rep, encoding, errors):
""" set the values from this selection: take = take ownership """
self.values = Int64Index(np.arange(self.table.nrows))
return self
def get_attr(self):
pass
def set_attr(self):
pass
class DataCol(IndexCol):
""" a data holding column, by definition this is not indexable
Parameters
----------
data : the actual data
cname : the column name in the table to hold the data (typically
values)
meta : a string description of the metadata
metadata : the actual metadata
"""
is_an_indexable = False
is_data_indexable = False
_info_fields = ['tz', 'ordered']
@classmethod
def create_for_block(
cls, i=None, name=None, cname=None, version=None, **kwargs):
""" return a new datacol with the block i """
if cname is None:
cname = name or 'values_block_%d' % i
if name is None:
name = cname
# prior to 0.10.1, we named values blocks like: values_block_0 and the
# name values_0
try:
if version[0] == 0 and version[1] <= 10 and version[2] == 0:
m = re.search(r"values_block_(\d+)", name)
if m:
name = "values_%s" % m.groups()[0]
except:
pass
return cls(name=name, cname=cname, **kwargs)
def __init__(self, values=None, kind=None, typ=None,
cname=None, data=None, meta=None, metadata=None,
block=None, **kwargs):
super(DataCol, self).__init__(values=values, kind=kind, typ=typ,
cname=cname, **kwargs)
self.dtype = None
self.dtype_attr = u("%s_dtype" % self.name)
self.meta = meta
self.meta_attr = u("%s_meta" % self.name)
self.set_data(data)
self.set_metadata(metadata)
def __unicode__(self):
temp = tuple(
map(pprint_thing,
(self.name,
self.cname,
self.dtype,
self.kind,
self.shape)))
return "name->%s,cname->%s,dtype->%s,kind->%s,shape->%s" % temp
def __eq__(self, other):
""" compare 2 col items """
return all(getattr(self, a, None) == getattr(other, a, None)
for a in ['name', 'cname', 'dtype', 'pos'])
def set_data(self, data, dtype=None):
self.data = data
if data is not None:
if dtype is not None:
self.dtype = dtype
self.set_kind()
elif self.dtype is None:
self.dtype = data.dtype.name
self.set_kind()
def take_data(self):
""" return the data & release the memory """
self.data, data = None, self.data
return data
def set_metadata(self, metadata):
""" record the metadata """
if metadata is not None:
metadata = np.array(metadata, copy=False).ravel()
self.metadata = metadata
def set_kind(self):
# set my kind if we can
if self.dtype is not None:
dtype = _ensure_decoded(self.dtype)
if dtype.startswith(u('string')) or dtype.startswith(u('bytes')):
self.kind = 'string'
elif dtype.startswith(u('float')):
self.kind = 'float'
elif dtype.startswith(u('complex')):
self.kind = 'complex'
elif dtype.startswith(u('int')) or dtype.startswith(u('uint')):
self.kind = 'integer'
elif dtype.startswith(u('date')):
self.kind = 'datetime'
elif dtype.startswith(u('timedelta')):
self.kind = 'timedelta'
elif dtype.startswith(u('bool')):
self.kind = 'bool'
else:
raise AssertionError(
"cannot interpret dtype of [%s] in [%s]" % (dtype, self))
# set my typ if we need
if self.typ is None:
self.typ = getattr(self.description, self.cname, None)
def set_atom(self, block, block_items, existing_col, min_itemsize,
nan_rep, info, encoding=None, errors='strict'):
""" create and setup my atom from the block b """
self.values = list(block_items)
# short-cut certain block types
if block.is_categorical:
return self.set_atom_categorical(block, items=block_items,
info=info)
elif block.is_datetimetz:
return self.set_atom_datetime64tz(block, info=info)
elif block.is_datetime:
return self.set_atom_datetime64(block)
elif block.is_timedelta:
return self.set_atom_timedelta64(block)
elif block.is_complex:
return self.set_atom_complex(block)
dtype = block.dtype.name
inferred_type = lib.infer_dtype(block.values)
if inferred_type == 'date':
raise TypeError(
"[date] is not implemented as a table column")
elif inferred_type == 'datetime':
# after 8260
# this would only be hit for a multi-timezone dtype
# which is an error
raise TypeError(
"too many timezones in this block, create separate "
"data columns"
)
elif inferred_type == 'unicode':
raise TypeError(
"[unicode] is not implemented as a table column")
# this is basically a catchall; if say a datetime64 has nans then it will
# end up here ###
elif inferred_type == 'string' or dtype == 'object':
self.set_atom_string(
block, block_items,
existing_col,
min_itemsize,
nan_rep,
encoding,
errors)
# set as a data block
else:
self.set_atom_data(block)
def get_atom_string(self, block, itemsize):
return _tables().StringCol(itemsize=itemsize, shape=block.shape[0])
def set_atom_string(self, block, block_items, existing_col, min_itemsize,
nan_rep, encoding, errors):
# fill nan items with myself, don't disturb the blocks by
# trying to downcast
block = block.fillna(nan_rep, downcast=False)
if isinstance(block, list):
block = block[0]
data = block.values
# see if we have a valid string type
inferred_type = lib.infer_dtype(data.ravel())
if inferred_type != 'string':
# we cannot serialize this data, so report an exception on a column
# by column basis
for i, item in enumerate(block_items):
col = block.iget(i)
inferred_type = lib.infer_dtype(col.ravel())
if inferred_type != 'string':
raise TypeError(
"Cannot serialize the column [%s] because\n"
"its data contents are [%s] object dtype"
% (item, inferred_type)
)
# itemsize is the maximum length of a string (along any dimension)
data_converted = _convert_string_array(data, encoding, errors)
itemsize = data_converted.itemsize
# specified min_itemsize?
if isinstance(min_itemsize, dict):
min_itemsize = int(min_itemsize.get(
self.name) or min_itemsize.get('values') or 0)
itemsize = max(min_itemsize or 0, itemsize)
# check for column in the values conflicts
if existing_col is not None:
eci = existing_col.validate_col(itemsize)
if eci > itemsize:
itemsize = eci
self.itemsize = itemsize
self.kind = 'string'
self.typ = self.get_atom_string(block, itemsize)
self.set_data(data_converted.astype('|S%d' % itemsize, copy=False))
def get_atom_coltype(self, kind=None):
""" return the PyTables column class for this column """
if kind is None:
kind = self.kind
if self.kind.startswith('uint'):
col_name = "UInt%sCol" % kind[4:]
else:
col_name = "%sCol" % kind.capitalize()
return getattr(_tables(), col_name)
def get_atom_data(self, block, kind=None):
return self.get_atom_coltype(kind=kind)(shape=block.shape[0])
def set_atom_complex(self, block):
self.kind = block.dtype.name
itemsize = int(self.kind.split('complex')[-1]) // 8
self.typ = _tables().ComplexCol(
itemsize=itemsize, shape=block.shape[0])
self.set_data(block.values.astype(self.typ.type, copy=False))
def set_atom_data(self, block):
self.kind = block.dtype.name
self.typ = self.get_atom_data(block)
self.set_data(block.values.astype(self.typ.type, copy=False))
def set_atom_categorical(self, block, items, info=None, values=None):
# currently only supports a 1-D categorical
# in a 1-D block
values = block.values
codes = values.codes
self.kind = 'integer'
self.dtype = codes.dtype.name
if values.ndim > 1:
raise NotImplementedError("only support 1-d categoricals")
if len(items) > 1:
raise NotImplementedError("only support single block categoricals")
# write the codes; must be in a block shape
self.ordered = values.ordered
self.typ = self.get_atom_data(block, kind=codes.dtype.name)
self.set_data(_block_shape(codes))
# write the categories
self.meta = 'category'
self.set_metadata(block.values.categories)
# update the info
self.update_info(info)
def get_atom_datetime64(self, block):
return _tables().Int64Col(shape=block.shape[0])
def set_atom_datetime64(self, block, values=None):
self.kind = 'datetime64'
self.typ = self.get_atom_datetime64(block)
if values is None:
values = block.values.view('i8')
self.set_data(values, 'datetime64')
def set_atom_datetime64tz(self, block, info, values=None):
if values is None:
values = block.values
# convert this column to i8 in UTC, and save the tz
values = values.asi8.reshape(block.shape)
# store a converted timezone
self.tz = _get_tz(block.values.tz)
self.update_info(info)
self.kind = 'datetime64'
self.typ = self.get_atom_datetime64(block)
self.set_data(values, 'datetime64')
def get_atom_timedelta64(self, block):
return _tables().Int64Col(shape=block.shape[0])
def set_atom_timedelta64(self, block, values=None):
self.kind = 'timedelta64'
self.typ = self.get_atom_timedelta64(block)
if values is None:
values = block.values.view('i8')
self.set_data(values, 'timedelta64')
@property
def shape(self):
return getattr(self.data, 'shape', None)
@property
def cvalues(self):
""" return my cython values """
return self.data
def validate_attr(self, append):
"""validate that we have the same order as the existing & same dtype"""
if append:
existing_fields = getattr(self.attrs, self.kind_attr, None)
if (existing_fields is not None and
existing_fields != list(self.values)):
raise ValueError("appended items do not match existing items"
" in table!")
existing_dtype = getattr(self.attrs, self.dtype_attr, None)
if (existing_dtype is not None and
existing_dtype != self.dtype):
raise ValueError("appended items dtype do not match existing "
"items dtype in table!")
def convert(self, values, nan_rep, encoding, errors):
"""set the data from this selection (and convert to the correct dtype
if we can)
"""
# values is a recarray
if values.dtype.fields is not None:
values = values[self.cname]
self.set_data(values)
# use the meta if needed
meta = _ensure_decoded(self.meta)
# convert to the correct dtype
if self.dtype is not None:
dtype = _ensure_decoded(self.dtype)
# reverse converts
if dtype == u('datetime64'):
# recreate with tz if indicated
self.data = _set_tz(self.data, self.tz, coerce=True)
elif dtype == u('timedelta64'):
self.data = np.asarray(self.data, dtype='m8[ns]')
elif dtype == u('date'):
try:
self.data = np.asarray(
[date.fromordinal(v) for v in self.data], dtype=object)
except ValueError:
self.data = np.asarray(
[date.fromtimestamp(v) for v in self.data],
dtype=object)
elif dtype == u('datetime'):
self.data = np.asarray(
[datetime.fromtimestamp(v) for v in self.data],
dtype=object)
elif meta == u('category'):
# we have a categorical
categories = self.metadata
codes = self.data.ravel()
# if we have stored a NaN in the categories
# then strip it; in theory we could have BOTH
# -1s in the codes and nulls :<
if categories is None:
# Handle case of NaN-only categorical columns in which case
# the categories are an empty array; when this is stored,
# pytables cannot write a zero-len array, so on readback
# the categories would be None and `read_hdf()` would fail.
categories = Index([], dtype=np.float64)
else:
mask = isna(categories)
if mask.any():
categories = categories[~mask]
codes[codes != -1] -= mask.astype(int).cumsum().values
self.data = Categorical.from_codes(codes,
categories=categories,
ordered=self.ordered)
else:
try:
self.data = self.data.astype(dtype, copy=False)
except:
self.data = self.data.astype('O', copy=False)
# convert nans / decode
if _ensure_decoded(self.kind) == u('string'):
self.data = _unconvert_string_array(
self.data, nan_rep=nan_rep, encoding=encoding, errors=errors)
return self
def get_attr(self):
""" get the data for this column """
self.values = getattr(self.attrs, self.kind_attr, None)
self.dtype = getattr(self.attrs, self.dtype_attr, None)
self.meta = getattr(self.attrs, self.meta_attr, None)
self.set_kind()
def set_attr(self):
""" set the data for this column """
setattr(self.attrs, self.kind_attr, self.values)
setattr(self.attrs, self.meta_attr, self.meta)
if self.dtype is not None:
setattr(self.attrs, self.dtype_attr, self.dtype)
class DataIndexableCol(DataCol):
""" represent a data column that can be indexed """
is_data_indexable = True
def validate_names(self):
if not Index(self.values).is_object():
raise ValueError("cannot have non-object label DataIndexableCol")
def get_atom_string(self, block, itemsize):
return _tables().StringCol(itemsize=itemsize)
def get_atom_data(self, block, kind=None):
return self.get_atom_coltype(kind=kind)()
def get_atom_datetime64(self, block):
return _tables().Int64Col()
def get_atom_timedelta64(self, block):
return _tables().Int64Col()
class GenericDataIndexableCol(DataIndexableCol):
""" represent a generic pytables data column """
def get_attr(self):
pass
class Fixed(StringMixin):
""" represent an object in my store
facilitate read/write of various types of objects
this is an abstract base class
Parameters
----------
parent : my parent HDFStore
group : the group node where the table resides
"""
pandas_kind = None
obj_type = None
ndim = None
is_table = False
def __init__(self, parent, group, encoding=None, errors='strict',
**kwargs):
self.parent = parent
self.group = group
self.encoding = _ensure_encoding(encoding)
self.errors = errors
self.set_version()
@property
def is_old_version(self):
return (self.version[0] <= 0 and self.version[1] <= 10 and
self.version[2] < 1)
def set_version(self):
""" compute and set our version """
version = _ensure_decoded(
getattr(self.group._v_attrs, 'pandas_version', None))
try:
self.version = tuple(int(x) for x in version.split('.'))
if len(self.version) == 2:
self.version = self.version + (0,)
except:
self.version = (0, 0, 0)
@property
def pandas_type(self):
return _ensure_decoded(getattr(self.group._v_attrs,
'pandas_type', None))
@property
def format_type(self):
return 'fixed'
def __unicode__(self):
""" return a pretty representation of myself """
self.infer_axes()
s = self.shape
if s is not None:
if isinstance(s, (list, tuple)):
s = "[%s]" % ','.join(pprint_thing(x) for x in s)
return "%-12.12s (shape->%s)" % (self.pandas_type, s)
return self.pandas_type
def set_object_info(self):
""" set my pandas type & version """
self.attrs.pandas_type = str(self.pandas_kind)
self.attrs.pandas_version = str(_version)
self.set_version()
def copy(self):
new_self = copy.copy(self)
return new_self
@property
def storage_obj_type(self):
return self.obj_type
@property
def shape(self):
return self.nrows
@property
def pathname(self):
return self.group._v_pathname
@property
def _handle(self):
return self.parent._handle
@property
def _filters(self):
return self.parent._filters
@property
def _complevel(self):
return self.parent._complevel
@property
def _fletcher32(self):
return self.parent._fletcher32
@property
def _complib(self):
return self.parent._complib
@property
def attrs(self):
return self.group._v_attrs
def set_attrs(self):
""" set our object attributes """
pass
def get_attrs(self):
""" get our object attributes """
pass
@property
def storable(self):
""" return my storable """
return self.group
@property
def is_exists(self):
return False
@property
def nrows(self):
return getattr(self.storable, 'nrows', None)
def validate(self, other):
""" validate against an existing storable """
if other is None:
return
return True
def validate_version(self, where=None):
""" are we trying to operate on an old version? """
return True
def infer_axes(self):
""" infer the axes of my storer
return a boolean indicating if we have a valid storer or not """
s = self.storable
if s is None:
return False
self.get_attrs()
return True
def read(self, **kwargs):
raise NotImplementedError(
"cannot read on an abstract storer: subclasses should implement")
def write(self, **kwargs):
raise NotImplementedError(
"cannot write on an abstract storer: sublcasses should implement")
def delete(self, where=None, start=None, stop=None, **kwargs):
"""
support fully deleting the node in its entirety (only) - where
specification must be None
"""
if com._all_none(where, start, stop):
self._handle.remove_node(self.group, recursive=True)
return None
raise TypeError("cannot delete on an abstract storer")
class GenericFixed(Fixed):
""" a generified fixed version """
_index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'}
_reverse_index_map = {v: k for k, v in compat.iteritems(_index_type_map)}
attributes = []
    # indexer helpers
def _class_to_alias(self, cls):
return self._index_type_map.get(cls, '')
def _alias_to_class(self, alias):
if isinstance(alias, type): # pragma: no cover
# compat: for a short period of time master stored types
return alias
return self._reverse_index_map.get(alias, Index)
def _get_index_factory(self, klass):
if klass == DatetimeIndex:
def f(values, freq=None, tz=None):
# data are already in UTC, localize and convert if tz present
result = DatetimeIndex._simple_new(values, None, freq=freq)
if tz is not None:
result = result.tz_localize('UTC').tz_convert(tz)
return result
return f
elif klass == PeriodIndex:
def f(values, freq=None, tz=None):
return PeriodIndex._simple_new(values, None, freq=freq)
return f
return klass
def validate_read(self, kwargs):
"""
remove table keywords from kwargs and return
        raise if any keywords are passed which are not None
"""
kwargs = copy.copy(kwargs)
columns = kwargs.pop('columns', None)
if columns is not None:
raise TypeError("cannot pass a column specification when reading "
"a Fixed format store. this store must be "
"selected in its entirety")
where = kwargs.pop('where', None)
if where is not None:
raise TypeError("cannot pass a where specification when reading "
"from a Fixed format store. this store must be "
"selected in its entirety")
return kwargs
@property
def is_exists(self):
return True
def set_attrs(self):
""" set our object attributes """
self.attrs.encoding = self.encoding
self.attrs.errors = self.errors
def get_attrs(self):
""" retrieve our attributes """
self.encoding = _ensure_encoding(getattr(self.attrs, 'encoding', None))
self.errors = getattr(self.attrs, 'errors', 'strict')
for n in self.attributes:
setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))
def write(self, obj, **kwargs):
self.set_attrs()
def read_array(self, key, start=None, stop=None):
""" read an array for the specified node (off of group """
import tables
node = getattr(self.group, key)
attrs = node._v_attrs
transposed = getattr(attrs, 'transposed', False)
if isinstance(node, tables.VLArray):
ret = node[0][start:stop]
else:
dtype = getattr(attrs, 'value_type', None)
shape = getattr(attrs, 'shape', None)
if shape is not None:
# length 0 axis
ret = np.empty(shape, dtype=dtype)
else:
ret = node[start:stop]
if dtype == u('datetime64'):
# reconstruct a timezone if indicated
ret = _set_tz(ret, getattr(attrs, 'tz', None), coerce=True)
elif dtype == u('timedelta64'):
ret = np.asarray(ret, dtype='m8[ns]')
if transposed:
return ret.T
else:
return ret
def read_index(self, key, **kwargs):
variety = _ensure_decoded(getattr(self.attrs, '%s_variety' % key))
if variety == u('multi'):
return self.read_multi_index(key, **kwargs)
elif variety == u('block'):
return self.read_block_index(key, **kwargs)
elif variety == u('sparseint'):
return self.read_sparse_intindex(key, **kwargs)
elif variety == u('regular'):
_, index = self.read_index_node(getattr(self.group, key), **kwargs)
return index
else: # pragma: no cover
raise TypeError('unrecognized index variety: %s' % variety)
def write_index(self, key, index):
if isinstance(index, MultiIndex):
setattr(self.attrs, '%s_variety' % key, 'multi')
self.write_multi_index(key, index)
elif isinstance(index, BlockIndex):
setattr(self.attrs, '%s_variety' % key, 'block')
self.write_block_index(key, index)
elif isinstance(index, IntIndex):
setattr(self.attrs, '%s_variety' % key, 'sparseint')
self.write_sparse_intindex(key, index)
else:
setattr(self.attrs, '%s_variety' % key, 'regular')
converted = _convert_index(index, self.encoding, self.errors,
self.format_type).set_name('index')
self.write_array(key, converted.values)
node = getattr(self.group, key)
node._v_attrs.kind = converted.kind
node._v_attrs.name = index.name
if isinstance(index, (DatetimeIndex, PeriodIndex)):
node._v_attrs.index_class = self._class_to_alias(type(index))
if hasattr(index, 'freq'):
node._v_attrs.freq = index.freq
if hasattr(index, 'tz') and index.tz is not None:
node._v_attrs.tz = _get_tz(index.tz)
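    # Descriptive note: for a 'regular' index, write_index() above stores the
    # converted values under `<group>/<key>` and records kind, name and (when
    # present) freq, tz and index_class as node attributes; read_index_node()
    # below relies on exactly these attributes to rebuild the Index.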
def write_block_index(self, key, index):
self.write_array('%s_blocs' % key, index.blocs)
self.write_array('%s_blengths' % key, index.blengths)
setattr(self.attrs, '%s_length' % key, index.length)
def read_block_index(self, key, **kwargs):
length = getattr(self.attrs, '%s_length' % key)
blocs = self.read_array('%s_blocs' % key, **kwargs)
blengths = self.read_array('%s_blengths' % key, **kwargs)
return BlockIndex(length, blocs, blengths)
def write_sparse_intindex(self, key, index):
self.write_array('%s_indices' % key, index.indices)
setattr(self.attrs, '%s_length' % key, index.length)
def read_sparse_intindex(self, key, **kwargs):
length = getattr(self.attrs, '%s_length' % key)
indices = self.read_array('%s_indices' % key, **kwargs)
return IntIndex(length, indices)
def write_multi_index(self, key, index):
setattr(self.attrs, '%s_nlevels' % key, index.nlevels)
for i, (lev, lab, name) in enumerate(zip(index.levels,
index.labels,
index.names)):
# write the level
level_key = '%s_level%d' % (key, i)
conv_level = _convert_index(lev, self.encoding, self.errors,
self.format_type).set_name(level_key)
self.write_array(level_key, conv_level.values)
node = getattr(self.group, level_key)
node._v_attrs.kind = conv_level.kind
node._v_attrs.name = name
# write the name
setattr(node._v_attrs, '%s_name%d' % (key, i), name)
# write the labels
label_key = '%s_label%d' % (key, i)
self.write_array(label_key, lab)
def read_multi_index(self, key, **kwargs):
nlevels = getattr(self.attrs, '%s_nlevels' % key)
levels = []
labels = []
names = []
for i in range(nlevels):
level_key = '%s_level%d' % (key, i)
name, lev = self.read_index_node(getattr(self.group, level_key),
**kwargs)
levels.append(lev)
names.append(name)
label_key = '%s_label%d' % (key, i)
lab = self.read_array(label_key, **kwargs)
labels.append(lab)
return MultiIndex(levels=levels, labels=labels, names=names,
verify_integrity=True)
def read_index_node(self, node, start=None, stop=None):
data = node[start:stop]
        # If the index was an empty array, write_array_empty() will
        # have written a sentinel. Here we replace it with the original.
if ('shape' in node._v_attrs and
self._is_empty_array(getattr(node._v_attrs, 'shape'))):
data = np.empty(getattr(node._v_attrs, 'shape'),
dtype=getattr(node._v_attrs, 'value_type'))
kind = _ensure_decoded(node._v_attrs.kind)
name = None
if 'name' in node._v_attrs:
name = _ensure_str(node._v_attrs.name)
index_class = self._alias_to_class(_ensure_decoded(
getattr(node._v_attrs, 'index_class', '')))
factory = self._get_index_factory(index_class)
kwargs = {}
if u('freq') in node._v_attrs:
kwargs['freq'] = node._v_attrs['freq']
if u('tz') in node._v_attrs:
kwargs['tz'] = node._v_attrs['tz']
if kind in (u('date'), u('datetime')):
index = factory(_unconvert_index(data, kind,
encoding=self.encoding,
errors=self.errors),
dtype=object, **kwargs)
else:
index = factory(_unconvert_index(data, kind,
encoding=self.encoding,
errors=self.errors), **kwargs)
index.name = name
return name, index
def write_array_empty(self, key, value):
""" write a 0-len array """
# ugly hack for length 0 axes
arr = np.empty((1,) * value.ndim)
self._handle.create_array(self.group, key, arr)
getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
getattr(self.group, key)._v_attrs.shape = value.shape
def _is_empty_array(self, shape):
"""Returns true if any axis is zero length."""
return any(x == 0 for x in shape)
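    # Descriptive note on write_array() below: empty arrays are routed through
    # write_array_empty(); when compression filters are set and an Atom exists
    # for the dtype, the data land in a chunked CArray; non-string object data
    # fall back to a VLArray of pickled objects (with a PerformanceWarning);
    # datetime64/timedelta64 values are stored as i8 views plus a value_type
    # attribute; everything else uses a plain create_array call.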
def write_array(self, key, value, items=None):
if key in self.group:
self._handle.remove_node(self.group, key)
# Transform needed to interface with pytables row/col notation
empty_array = self._is_empty_array(value.shape)
transposed = False
if is_categorical_dtype(value):
raise NotImplementedError('Cannot store a category dtype in '
'a HDF5 dataset that uses format='
'"fixed". Use format="table".')
if not empty_array:
value = value.T
transposed = True
if self._filters is not None:
atom = None
try:
# get the atom for this datatype
atom = _tables().Atom.from_dtype(value.dtype)
except ValueError:
pass
if atom is not None:
# create an empty chunked array and fill it from value
if not empty_array:
ca = self._handle.create_carray(self.group, key, atom,
value.shape,
filters=self._filters)
ca[:] = value
getattr(self.group, key)._v_attrs.transposed = transposed
else:
self.write_array_empty(key, value)
return
if value.dtype.type == np.object_:
# infer the type, warn if we have a non-string type here (for
# performance)
inferred_type = lib.infer_dtype(value.ravel())
if empty_array:
pass
elif inferred_type == 'string':
pass
else:
try:
items = list(items)
except:
pass
ws = performance_doc % (inferred_type, key, items)
warnings.warn(ws, PerformanceWarning, stacklevel=7)
vlarr = self._handle.create_vlarray(self.group, key,
_tables().ObjectAtom())
vlarr.append(value)
else:
if empty_array:
self.write_array_empty(key, value)
else:
if is_datetime64_dtype(value.dtype):
self._handle.create_array(
self.group, key, value.view('i8'))
getattr(
self.group, key)._v_attrs.value_type = 'datetime64'
elif is_datetime64tz_dtype(value.dtype):
# store as UTC
# with a zone
self._handle.create_array(self.group, key,
value.asi8)
node = getattr(self.group, key)
node._v_attrs.tz = _get_tz(value.tz)
node._v_attrs.value_type = 'datetime64'
elif is_timedelta64_dtype(value.dtype):
self._handle.create_array(
self.group, key, value.view('i8'))
getattr(
self.group, key)._v_attrs.value_type = 'timedelta64'
else:
self._handle.create_array(self.group, key, value)
getattr(self.group, key)._v_attrs.transposed = transposed
class LegacyFixed(GenericFixed):
def read_index_legacy(self, key, start=None, stop=None):
node = getattr(self.group, key)
data = node[start:stop]
kind = node._v_attrs.kind
return _unconvert_index_legacy(data, kind, encoding=self.encoding,
errors=self.errors)
class LegacySeriesFixed(LegacyFixed):
def read(self, **kwargs):
kwargs = self.validate_read(kwargs)
index = self.read_index_legacy('index')
values = self.read_array('values')
return Series(values, index=index)
class LegacyFrameFixed(LegacyFixed):
def read(self, **kwargs):
kwargs = self.validate_read(kwargs)
index = self.read_index_legacy('index')
columns = self.read_index_legacy('columns')
values = self.read_array('values')
return DataFrame(values, index=index, columns=columns)
class SeriesFixed(GenericFixed):
pandas_kind = u('series')
attributes = ['name']
@property
def shape(self):
try:
return len(getattr(self.group, 'values')),
except:
return None
def read(self, **kwargs):
kwargs = self.validate_read(kwargs)
index = self.read_index('index', **kwargs)
values = self.read_array('values', **kwargs)
return Series(values, index=index, name=self.name)
def write(self, obj, **kwargs):
super(SeriesFixed, self).write(obj, **kwargs)
self.write_index('index', obj.index)
self.write_array('values', obj.values)
self.attrs.name = obj.name
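# Hedged usage sketch (placeholder file/key names, not executed): a
# fixed-format Series round trip goes through SeriesFixed above, e.g.
#
#     s = pd.Series(range(3))
#     s.to_hdf('store.h5', 'k', format='fixed')   # -> SeriesFixed.write
#     pd.read_hdf('store.h5', 'k')                # -> SeriesFixed.read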
class SparseFixed(GenericFixed):
def validate_read(self, kwargs):
"""
we don't support start, stop kwds in Sparse
"""
kwargs = super(SparseFixed, self).validate_read(kwargs)
if 'start' in kwargs or 'stop' in kwargs:
raise NotImplementedError("start and/or stop are not supported "
"in fixed Sparse reading")
return kwargs
class SparseSeriesFixed(SparseFixed):
pandas_kind = u('sparse_series')
attributes = ['name', 'fill_value', 'kind']
def read(self, **kwargs):
kwargs = self.validate_read(kwargs)
index = self.read_index('index')
sp_values = self.read_array('sp_values')
sp_index = self.read_index('sp_index')
return SparseSeries(sp_values, index=index, sparse_index=sp_index,
kind=self.kind or u('block'),
fill_value=self.fill_value,
name=self.name)
def write(self, obj, **kwargs):
super(SparseSeriesFixed, self).write(obj, **kwargs)
self.write_index('index', obj.index)
self.write_index('sp_index', obj.sp_index)
self.write_array('sp_values', obj.sp_values)
self.attrs.name = obj.name
self.attrs.fill_value = obj.fill_value
self.attrs.kind = obj.kind
class SparseFrameFixed(SparseFixed):
pandas_kind = u('sparse_frame')
attributes = ['default_kind', 'default_fill_value']
def read(self, **kwargs):
kwargs = self.validate_read(kwargs)
columns = self.read_index('columns')
sdict = {}
for c in columns:
key = 'sparse_series_%s' % c
s = SparseSeriesFixed(self.parent, getattr(self.group, key))
s.infer_axes()
sdict[c] = s.read()
return SparseDataFrame(sdict, columns=columns,
default_kind=self.default_kind,
default_fill_value=self.default_fill_value)
def write(self, obj, **kwargs):
""" write it as a collection of individual sparse series """
super(SparseFrameFixed, self).write(obj, **kwargs)
for name, ss in compat.iteritems(obj):
key = 'sparse_series_%s' % name
if key not in self.group._v_children:
node = self._handle.create_group(self.group, key)
else:
node = getattr(self.group, key)
s = SparseSeriesFixed(self.parent, node)
s.write(ss)
self.attrs.default_fill_value = obj.default_fill_value
self.attrs.default_kind = obj.default_kind
self.write_index('columns', obj.columns)
class BlockManagerFixed(GenericFixed):
attributes = ['ndim', 'nblocks']
is_shape_reversed = False
@property
def shape(self):
try:
ndim = self.ndim
# items
items = 0
for i in range(self.nblocks):
node = getattr(self.group, 'block%d_items' % i)
shape = getattr(node, 'shape', None)
if shape is not None:
items += shape[0]
# data shape
node = getattr(self.group, 'block0_values')
shape = getattr(node, 'shape', None)
if shape is not None:
shape = list(shape[0:(ndim - 1)])
else:
shape = []
shape.append(items)
# hacky - this works for frames, but is reversed for panels
if self.is_shape_reversed:
shape = shape[::-1]
return shape
except:
return None
def read(self, start=None, stop=None, **kwargs):
# start, stop applied to rows, so 0th axis only
kwargs = self.validate_read(kwargs)
select_axis = self.obj_type()._get_block_manager_axis(0)
axes = []
for i in range(self.ndim):
_start, _stop = (start, stop) if i == select_axis else (None, None)
ax = self.read_index('axis%d' % i, start=_start, stop=_stop)
axes.append(ax)
items = axes[0]
blocks = []
for i in range(self.nblocks):
blk_items = self.read_index('block%d_items' % i)
values = self.read_array('block%d_values' % i,
start=_start, stop=_stop)
blk = make_block(values,
placement=items.get_indexer(blk_items))
blocks.append(blk)
return self.obj_type(BlockManager(blocks, axes))
def write(self, obj, **kwargs):
super(BlockManagerFixed, self).write(obj, **kwargs)
data = obj._data
if not data.is_consolidated():
data = data.consolidate()
self.attrs.ndim = data.ndim
for i, ax in enumerate(data.axes):
if i == 0:
if not ax.is_unique:
raise ValueError(
"Columns index has to be unique for fixed format")
self.write_index('axis%d' % i, ax)
# Supporting mixed-type DataFrame objects...nontrivial
self.attrs.nblocks = len(data.blocks)
for i, blk in enumerate(data.blocks):
# I have no idea why, but writing values before items fixed #2299
blk_items = data.items.take(blk.mgr_locs)
self.write_array('block%d_values' % i, blk.values, items=blk_items)
self.write_index('block%d_items' % i, blk_items)
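# Descriptive note: BlockManagerFixed.write() above lays a frame out as one
# 'axis%d' node per axis plus, for each consolidated block i, a
# 'block%d_values' array and a 'block%d_items' index; read() reassembles a
# BlockManager from exactly those nodes.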
class FrameFixed(BlockManagerFixed):
pandas_kind = u('frame')
obj_type = DataFrame
class PanelFixed(BlockManagerFixed):
pandas_kind = u('wide')
obj_type = Panel
is_shape_reversed = True
def write(self, obj, **kwargs):
obj._consolidate_inplace()
return super(PanelFixed, self).write(obj, **kwargs)
class Table(Fixed):
""" represent a table:
facilitate read/write of various types of tables
Attrs in Table Node
-------------------
        These are attributes that are stored in the main table node; they are
necessary to recreate these tables when read back in.
index_axes : a list of tuples of the (original indexing axis and
index column)
non_index_axes: a list of tuples of the (original index axis and
columns on a non-indexing axis)
values_axes : a list of the columns which comprise the data of this
table
data_columns : a list of the columns that we are allowing indexing
(these become single columns in values_axes), or True to force all
columns
nan_rep : the string to use for nan representations for string
objects
levels : the names of levels
metadata : the names of the metadata columns
"""
pandas_kind = u('wide_table')
table_type = None
levels = 1
is_table = True
is_shape_reversed = False
def __init__(self, *args, **kwargs):
super(Table, self).__init__(*args, **kwargs)
self.index_axes = []
self.non_index_axes = []
self.values_axes = []
self.data_columns = []
self.metadata = []
self.info = dict()
self.nan_rep = None
self.selection = None
@property
def table_type_short(self):
return self.table_type.split('_')[0]
@property
def format_type(self):
return 'table'
def __unicode__(self):
""" return a pretty representatgion of myself """
self.infer_axes()
dc = ",dc->[%s]" % ','.join(
self.data_columns) if len(self.data_columns) else ''
ver = ''
if self.is_old_version:
ver = "[%s]" % '.'.join(str(x) for x in self.version)
return "%-12.12s%s (typ->%s,nrows->%s,ncols->%s,indexers->[%s]%s)" % (
self.pandas_type, ver, self.table_type_short, self.nrows,
self.ncols, ','.join(a.name for a in self.index_axes), dc
)
def __getitem__(self, c):
""" return the axis for c """
for a in self.axes:
if c == a.name:
return a
return None
def validate(self, other):
""" validate against an existing table """
if other is None:
return
if other.table_type != self.table_type:
raise TypeError("incompatible table_type with existing [%s - %s]" %
(other.table_type, self.table_type))
for c in ['index_axes', 'non_index_axes', 'values_axes']:
sv = getattr(self, c, None)
ov = getattr(other, c, None)
if sv != ov:
# show the error for the specific axes
for i, sax in enumerate(sv):
oax = ov[i]
if sax != oax:
raise ValueError(
"invalid combinate of [%s] on appending data [%s] "
"vs current table [%s]" % (c, sax, oax))
# should never get here
raise Exception(
"invalid combinate of [%s] on appending data [%s] vs "
"current table [%s]" % (c, sv, ov))
@property
def is_multi_index(self):
"""the levels attribute is 1 or a list in the case of a multi-index"""
return isinstance(self.levels, list)
def validate_metadata(self, existing):
""" create / validate metadata """
self.metadata = [
c.name for c in self.values_axes if c.metadata is not None]
def validate_multiindex(self, obj):
"""validate that we can store the multi-index; reset and return the
new object
"""
levels = [l if l is not None else "level_{0}".format(i)
for i, l in enumerate(obj.index.names)]
try:
return obj.reset_index(), levels
except ValueError:
raise ValueError("duplicate names/columns in the multi-index when "
"storing as a table")
@property
def nrows_expected(self):
""" based on our axes, compute the expected nrows """
return np.prod([i.cvalues.shape[0] for i in self.index_axes])
@property
def is_exists(self):
""" has this table been created """
return u('table') in self.group
@property
def storable(self):
return getattr(self.group, 'table', None)
@property
def table(self):
""" return the table group (this is my storable) """
return self.storable
@property
def dtype(self):
return self.table.dtype
@property
def description(self):
return self.table.description
@property
def axes(self):
return itertools.chain(self.index_axes, self.values_axes)
@property
def ncols(self):
""" the number of total columns in the values axes """
return sum(len(a.values) for a in self.values_axes)
@property
def is_transposed(self):
return False
@property
def data_orientation(self):
"""return a tuple of my permutated axes, non_indexable at the front"""
return tuple(itertools.chain([int(a[0]) for a in self.non_index_axes],
[int(a.axis) for a in self.index_axes]))
def queryables(self):
""" return a dict of the kinds allowable columns for this object """
# compute the values_axes queryables
return dict(
[(a.cname, a) for a in self.index_axes] +
[(self.storage_obj_type._AXIS_NAMES[axis], None)
for axis, values in self.non_index_axes] +
[(v.cname, v) for v in self.values_axes
if v.name in set(self.data_columns)]
)
def index_cols(self):
""" return a list of my index cols """
return [(i.axis, i.cname) for i in self.index_axes]
def values_cols(self):
""" return a list of my values cols """
return [i.cname for i in self.values_axes]
def _get_metadata_path(self, key):
""" return the metadata pathname for this key """
return "{group}/meta/{key}/meta".format(group=self.group._v_pathname,
key=key)
def write_metadata(self, key, values):
"""
write out a meta data array to the key as a fixed-format Series
Parameters
----------
key : string
values : ndarray
"""
values = Series(values)
self.parent.put(self._get_metadata_path(key), values, format='table',
encoding=self.encoding, errors=self.errors,
nan_rep=self.nan_rep)
def read_metadata(self, key):
""" return the meta data array for this key """
if getattr(getattr(self.group, 'meta', None), key, None) is not None:
return self.parent.select(self._get_metadata_path(key))
return None
def set_info(self):
""" update our table index info """
self.attrs.info = self.info
def set_attrs(self):
""" set our table type & indexables """
self.attrs.table_type = str(self.table_type)
self.attrs.index_cols = self.index_cols()
self.attrs.values_cols = self.values_cols()
self.attrs.non_index_axes = self.non_index_axes
self.attrs.data_columns = self.data_columns
self.attrs.nan_rep = self.nan_rep
self.attrs.encoding = self.encoding
self.attrs.errors = self.errors
self.attrs.levels = self.levels
self.attrs.metadata = self.metadata
self.set_info()
def get_attrs(self):
""" retrieve our attributes """
self.non_index_axes = getattr(
self.attrs, 'non_index_axes', None) or []
self.data_columns = getattr(
self.attrs, 'data_columns', None) or []
self.info = getattr(
self.attrs, 'info', None) or dict()
self.nan_rep = getattr(self.attrs, 'nan_rep', None)
self.encoding = _ensure_encoding(
getattr(self.attrs, 'encoding', None))
self.errors = getattr(self.attrs, 'errors', 'strict')
self.levels = getattr(
self.attrs, 'levels', None) or []
self.index_axes = [
a.infer(self) for a in self.indexables if a.is_an_indexable
]
self.values_axes = [
a.infer(self) for a in self.indexables if not a.is_an_indexable
]
self.metadata = getattr(
self.attrs, 'metadata', None) or []
def validate_version(self, where=None):
""" are we trying to operate on an old version? """
if where is not None:
if (self.version[0] <= 0 and self.version[1] <= 10 and
self.version[2] < 1):
ws = incompatibility_doc % '.'.join(
[str(x) for x in self.version])
warnings.warn(ws, IncompatibilityWarning)
def validate_min_itemsize(self, min_itemsize):
"""validate the min_itemisze doesn't contain items that are not in the
axes this needs data_columns to be defined
"""
if min_itemsize is None:
return
if not isinstance(min_itemsize, dict):
return
q = self.queryables()
for k, v in min_itemsize.items():
# ok, apply generally
if k == 'values':
continue
if k not in q:
raise ValueError(
"min_itemsize has the key [%s] which is not an axis or "
"data_column" % k)
@property
def indexables(self):
""" create/cache the indexables if they don't exist """
if self._indexables is None:
self._indexables = []
# index columns
self._indexables.extend([
IndexCol(name=name, axis=axis, pos=i)
for i, (axis, name) in enumerate(self.attrs.index_cols)
])
# values columns
dc = set(self.data_columns)
base_pos = len(self._indexables)
def f(i, c):
klass = DataCol
if c in dc:
klass = DataIndexableCol
return klass.create_for_block(i=i, name=c, pos=base_pos + i,
version=self.version)
self._indexables.extend(
[f(i, c) for i, c in enumerate(self.attrs.values_cols)])
return self._indexables
def create_index(self, columns=None, optlevel=None, kind=None):
"""
Create a pytables index on the specified columns
note: cannot index Time64Col() or ComplexCol currently;
PyTables must be >= 3.0
Parameters
----------
columns : False (don't create an index), True (create all columns
index), None or list_like (the indexers to index)
optlevel: optimization level (defaults to 6)
kind : kind of index (defaults to 'medium')
Exceptions
----------
raises if the node is not a table
"""
if not self.infer_axes():
return
if columns is False:
return
# index all indexables and data_columns
if columns is None or columns is True:
columns = [a.cname for a in self.axes if a.is_data_indexable]
if not isinstance(columns, (tuple, list)):
columns = [columns]
kw = dict()
if optlevel is not None:
kw['optlevel'] = optlevel
if kind is not None:
kw['kind'] = kind
table = self.table
for c in columns:
v = getattr(table.cols, c, None)
if v is not None:
# remove the index if the kind/optlevel have changed
if v.is_indexed:
index = v.index
cur_optlevel = index.optlevel
cur_kind = index.kind
if kind is not None and cur_kind != kind:
v.remove_index()
else:
kw['kind'] = cur_kind
if optlevel is not None and cur_optlevel != optlevel:
v.remove_index()
else:
kw['optlevel'] = cur_optlevel
# create the index
if not v.is_indexed:
if v.type.startswith('complex'):
raise TypeError(
'Columns containing complex values can be stored '
'but cannot'
' be indexed when using table format. Either use '
'fixed format, set index=False, or do not include '
'the columns containing complex values to '
'data_columns when initializing the table.')
v.create_index(**kw)
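    # Hedged usage sketch (placeholder names): a call like
    #     store.create_table_index('df', columns=['A'], optlevel=9,
    #                              kind='full')
    # reaches this method on the Table storer and invokes PyTables'
    # Column.create_index() for each requested data column.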
def read_axes(self, where, **kwargs):
"""create and return the axes sniffed from the table: return boolean
for success
"""
# validate the version
self.validate_version(where)
# infer the data kind
if not self.infer_axes():
return False
# create the selection
self.selection = Selection(self, where=where, **kwargs)
values = self.selection.select()
# convert the data
for a in self.axes:
a.set_info(self.info)
a.convert(values, nan_rep=self.nan_rep, encoding=self.encoding,
errors=self.errors)
return True
def get_object(self, obj):
""" return the data for this obj """
return obj
def validate_data_columns(self, data_columns, min_itemsize):
"""take the input data_columns and min_itemize and create a data
columns spec
"""
if not len(self.non_index_axes):
return []
axis, axis_labels = self.non_index_axes[0]
info = self.info.get(axis, dict())
if info.get('type') == 'MultiIndex' and data_columns:
raise ValueError("cannot use a multi-index on axis [{0}] with "
"data_columns {1}".format(axis, data_columns))
# evaluate the passed data_columns, True == use all columns
        # take only valid axis labels
if data_columns is True:
data_columns = list(axis_labels)
elif data_columns is None:
data_columns = []
# if min_itemsize is a dict, add the keys (exclude 'values')
if isinstance(min_itemsize, dict):
existing_data_columns = set(data_columns)
data_columns.extend([
k for k in min_itemsize.keys()
if k != 'values' and k not in existing_data_columns
])
# return valid columns in the order of our axis
return [c for c in data_columns if c in axis_labels]
def create_axes(self, axes, obj, validate=True, nan_rep=None,
data_columns=None, min_itemsize=None, **kwargs):
""" create and return the axes
        legacy tables create an indexable column, indexable index,
non-indexable fields
Parameters:
-----------
axes: a list of the axes in order to create (names or numbers of
the axes)
obj : the object to create axes on
validate: validate the obj against an existing object already
written
min_itemsize: a dict of the min size for a column in bytes
        nan_rep : a value to use for string column nan_rep
encoding : the encoding for string values
data_columns : a list of columns that we want to create separate to
allow indexing (or True will force all columns)
"""
# set the default axes if needed
if axes is None:
try:
axes = _AXES_MAP[type(obj)]
except:
raise TypeError("cannot properly create the storer for: "
"[group->%s,value->%s]"
% (self.group._v_name, type(obj)))
# map axes to numbers
axes = [obj._get_axis_number(a) for a in axes]
# do we have an existing table (if so, use its axes & data_columns)
if self.infer_axes():
existing_table = self.copy()
existing_table.infer_axes()
axes = [a.axis for a in existing_table.index_axes]
data_columns = existing_table.data_columns
nan_rep = existing_table.nan_rep
self.encoding = existing_table.encoding
self.errors = existing_table.errors
self.info = copy.copy(existing_table.info)
else:
existing_table = None
        # currently only support ndim-1 axes
if len(axes) != self.ndim - 1:
raise ValueError(
"currently only support ndim-1 indexers in an AppendableTable")
# create according to the new data
self.non_index_axes = []
self.data_columns = []
# nan_representation
if nan_rep is None:
nan_rep = 'nan'
self.nan_rep = nan_rep
# create axes to index and non_index
index_axes_map = dict()
for i, a in enumerate(obj.axes):
if i in axes:
name = obj._AXIS_NAMES[i]
index_axes_map[i] = _convert_index(
a, self.encoding, self.errors, self.format_type
).set_name(name).set_axis(i)
else:
# we might be able to change the axes on the appending data if
# necessary
append_axis = list(a)
if existing_table is not None:
indexer = len(self.non_index_axes)
exist_axis = existing_table.non_index_axes[indexer][1]
if not array_equivalent(np.array(append_axis),
np.array(exist_axis)):
# ahah! -> reindex
if array_equivalent(np.array(sorted(append_axis)),
np.array(sorted(exist_axis))):
append_axis = exist_axis
# the non_index_axes info
info = _get_info(self.info, i)
info['names'] = list(a.names)
info['type'] = a.__class__.__name__
self.non_index_axes.append((i, append_axis))
# set axis positions (based on the axes)
self.index_axes = [
index_axes_map[a].set_pos(j).update_info(self.info)
for j, a in enumerate(axes)
]
j = len(self.index_axes)
# check for column conflicts
for a in self.axes:
a.maybe_set_size(min_itemsize=min_itemsize)
# reindex by our non_index_axes & compute data_columns
for a in self.non_index_axes:
obj = _reindex_axis(obj, a[0], a[1])
def get_blk_items(mgr, blocks):
return [mgr.items.take(blk.mgr_locs) for blk in blocks]
# figure out data_columns and get out blocks
block_obj = self.get_object(obj)._consolidate()
blocks = block_obj._data.blocks
blk_items = get_blk_items(block_obj._data, blocks)
if len(self.non_index_axes):
axis, axis_labels = self.non_index_axes[0]
data_columns = self.validate_data_columns(
data_columns, min_itemsize)
if len(data_columns):
mgr = block_obj.reindex(
Index(axis_labels).difference(Index(data_columns)),
axis=axis
)._data
blocks = list(mgr.blocks)
blk_items = get_blk_items(mgr, blocks)
for c in data_columns:
mgr = block_obj.reindex([c], axis=axis)._data
blocks.extend(mgr.blocks)
blk_items.extend(get_blk_items(mgr, mgr.blocks))
# reorder the blocks in the same order as the existing_table if we can
if existing_table is not None:
by_items = {tuple(b_items.tolist()): (b, b_items)
for b, b_items in zip(blocks, blk_items)}
new_blocks = []
new_blk_items = []
for ea in existing_table.values_axes:
items = tuple(ea.values)
try:
b, b_items = by_items.pop(items)
new_blocks.append(b)
new_blk_items.append(b_items)
except:
raise ValueError(
"cannot match existing table structure for [%s] on "
"appending data" % ','.join(pprint_thing(item) for
item in items))
blocks = new_blocks
blk_items = new_blk_items
# add my values
self.values_axes = []
for i, (b, b_items) in enumerate(zip(blocks, blk_items)):
            # the shape of the data column is given by the indexable axes
klass = DataCol
name = None
# we have a data_column
if (data_columns and len(b_items) == 1 and
b_items[0] in data_columns):
klass = DataIndexableCol
name = b_items[0]
self.data_columns.append(name)
# make sure that we match up the existing columns
# if we have an existing table
if existing_table is not None and validate:
try:
existing_col = existing_table.values_axes[i]
except:
raise ValueError("Incompatible appended table [%s] with "
"existing table [%s]"
% (blocks, existing_table.values_axes))
else:
existing_col = None
try:
col = klass.create_for_block(
i=i, name=name, version=self.version)
col.set_atom(block=b, block_items=b_items,
existing_col=existing_col,
min_itemsize=min_itemsize,
nan_rep=nan_rep,
encoding=self.encoding,
errors=self.errors,
info=self.info)
col.set_pos(j)
self.values_axes.append(col)
except (NotImplementedError, ValueError, TypeError) as e:
raise e
except Exception as detail:
raise Exception(
"cannot find the correct atom type -> "
"[dtype->%s,items->%s] %s"
% (b.dtype.name, b_items, str(detail))
)
j += 1
# validate our min_itemsize
self.validate_min_itemsize(min_itemsize)
# validate our metadata
self.validate_metadata(existing_table)
# validate the axes if we have an existing table
if validate:
self.validate(existing_table)
def process_axes(self, obj, columns=None):
""" process axes filters """
# make a copy to avoid side effects
if columns is not None:
columns = list(columns)
# make sure to include levels if we have them
if columns is not None and self.is_multi_index:
for n in self.levels:
if n not in columns:
columns.insert(0, n)
# reorder by any non_index_axes & limit to the select columns
for axis, labels in self.non_index_axes:
obj = _reindex_axis(obj, axis, labels, columns)
# apply the selection filters (but keep in the same order)
if self.selection.filter is not None:
for field, op, filt in self.selection.filter.format():
def process_filter(field, filt):
for axis_name in obj._AXIS_NAMES.values():
axis_number = obj._get_axis_number(axis_name)
axis_values = obj._get_axis(axis_name)
# see if the field is the name of an axis
if field == axis_name:
# if we have a multi-index, then need to include
# the levels
if self.is_multi_index:
filt = filt.union(Index(self.levels))
takers = op(axis_values, filt)
return obj.loc._getitem_axis(takers,
axis=axis_number)
                        # this might be the name of a field IN an axis
elif field in axis_values:
# we need to filter on this dimension
values = _ensure_index(getattr(obj, field).values)
filt = _ensure_index(filt)
# hack until we support reversed dim flags
if isinstance(obj, DataFrame):
axis_number = 1 - axis_number
takers = op(values, filt)
return obj.loc._getitem_axis(takers,
axis=axis_number)
raise ValueError(
"cannot find the field [%s] for filtering!" % field)
obj = process_filter(field, filt)
return obj
def create_description(self, complib=None, complevel=None,
fletcher32=False, expectedrows=None):
""" create the description of the table from the axes & values """
        # use the provided expectedrows if it was passed
if expectedrows is None:
expectedrows = max(self.nrows_expected, 10000)
d = dict(name='table', expectedrows=expectedrows)
# description from the axes & values
d['description'] = {a.cname: a.typ for a in self.axes}
if complib:
if complevel is None:
complevel = self._complevel or 9
filters = _tables().Filters(
complevel=complevel, complib=complib,
fletcher32=fletcher32 or self._fletcher32)
d['filters'] = filters
elif self._filters is not None:
d['filters'] = self._filters
return d
def read_coordinates(self, where=None, start=None, stop=None, **kwargs):
"""select coordinates (row numbers) from a table; return the
coordinates object
"""
# validate the version
self.validate_version(where)
# infer the data kind
if not self.infer_axes():
return False
# create the selection
self.selection = Selection(
self, where=where, start=start, stop=stop, **kwargs)
coords = self.selection.select_coords()
if self.selection.filter is not None:
for field, op, filt in self.selection.filter.format():
data = self.read_column(
field, start=coords.min(), stop=coords.max() + 1)
coords = coords[
op(data.iloc[coords - coords.min()], filt).values]
return Index(coords)
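    # Hedged sketch of the coordinates workflow (placeholder key/condition):
    #     coords = store.select_as_coordinates('df', 'index > 5')
    #     df = store.select('df', where=coords)
    # select_as_coordinates() routes here and returns an Index of row numbers
    # that can be fed back into select().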
def read_column(self, column, where=None, start=None, stop=None, **kwargs):
"""return a single column from the table, generally only indexables
are interesting
"""
# validate the version
self.validate_version()
# infer the data kind
if not self.infer_axes():
return False
if where is not None:
raise TypeError("read_column does not currently accept a where "
"clause")
# find the axes
for a in self.axes:
if column == a.name:
if not a.is_data_indexable:
raise ValueError(
"column [%s] can not be extracted individually; it is "
"not data indexable" % column)
# column must be an indexable or a data column
c = getattr(self.table.cols, column)
a.set_info(self.info)
return Series(_set_tz(a.convert(c[start:stop],
nan_rep=self.nan_rep,
encoding=self.encoding,
errors=self.errors
).take_data(),
a.tz, True), name=column)
raise KeyError("column [%s] not found in the table" % column)
class WORMTable(Table):
""" a write-once read-many table: this format DOES NOT ALLOW appending to a
table. writing is a one-time operation the data are stored in a format
that allows for searching the data on disk
"""
table_type = u('worm')
def read(self, **kwargs):
""" read the indicies and the indexing array, calculate offset rows and
return """
raise NotImplementedError("WORMTable needs to implement read")
def write(self, **kwargs):
""" write in a format that we can search later on (but cannot append
to): write out the indicies and the values using _write_array
(e.g. a CArray) create an indexing table so that we can search
"""
raise NotImplementedError("WORKTable needs to implement write")
class LegacyTable(Table):
""" an appendable table: allow append/query/delete operations to a
(possibly) already existing appendable table this table ALLOWS
append (but doesn't require them), and stores the data in a format
that can be easily searched
"""
_indexables = [
IndexCol(name='index', axis=1, pos=0),
IndexCol(name='column', axis=2, pos=1, index_kind='columns_kind'),
DataCol(name='fields', cname='values', kind_attr='fields', pos=2)
]
table_type = u('legacy')
ndim = 3
def write(self, **kwargs):
raise TypeError("write operations are not allowed on legacy tables!")
def read(self, where=None, columns=None, **kwargs):
"""we have n indexable columns, with an arbitrary number of data
axes
"""
if not self.read_axes(where=where, **kwargs):
return None
lst_vals = [a.values for a in self.index_axes]
labels, levels = _factorize_from_iterables(lst_vals)
# labels and levels are tuples but lists are expected
labels = list(labels)
levels = list(levels)
N = [len(lvl) for lvl in levels]
# compute the key
key = _factor_indexer(N[1:], labels)
objs = []
if len(unique(key)) == len(key):
sorter, _ = algos.groupsort_indexer(
_ensure_int64(key), np.prod(N))
sorter = _ensure_platform_int(sorter)
# create the objs
for c in self.values_axes:
# the data need to be sorted
sorted_values = c.take_data().take(sorter, axis=0)
if sorted_values.ndim == 1:
sorted_values = sorted_values.reshape(
(sorted_values.shape[0], 1))
take_labels = [l.take(sorter) for l in labels]
items = Index(c.values)
block = _block2d_to_blocknd(
values=sorted_values, placement=np.arange(len(items)),
shape=tuple(N), labels=take_labels, ref_items=items)
# create the object
mgr = BlockManager([block], [items] + levels)
obj = self.obj_type(mgr)
# permute if needed
if self.is_transposed:
obj = obj.transpose(
*tuple(Series(self.data_orientation).argsort()))
objs.append(obj)
else:
warnings.warn(duplicate_doc, DuplicateWarning, stacklevel=5)
# reconstruct
long_index = MultiIndex.from_arrays(
[i.values for i in self.index_axes])
for c in self.values_axes:
lp = DataFrame(c.data, index=long_index, columns=c.values)
# need a better algorithm
tuple_index = long_index.values
unique_tuples = unique(tuple_index)
unique_tuples = com._asarray_tuplesafe(unique_tuples)
indexer = match(unique_tuples, tuple_index)
indexer = _ensure_platform_int(indexer)
new_index = long_index.take(indexer)
new_values = lp.values.take(indexer, axis=0)
lp = DataFrame(new_values, index=new_index, columns=lp.columns)
objs.append(lp.to_panel())
# create the composite object
if len(objs) == 1:
wp = objs[0]
else:
wp = concat(objs, axis=0, verify_integrity=False)._consolidate()
# apply the selection filters & axis orderings
wp = self.process_axes(wp, columns=columns)
return wp
class LegacyFrameTable(LegacyTable):
""" support the legacy frame table """
pandas_kind = u('frame_table')
table_type = u('legacy_frame')
obj_type = Panel
def read(self, *args, **kwargs):
return super(LegacyFrameTable, self).read(*args, **kwargs)['value']
class LegacyPanelTable(LegacyTable):
""" support the legacy panel table """
table_type = u('legacy_panel')
obj_type = Panel
class AppendableTable(LegacyTable):
""" suppor the new appendable table formats """
_indexables = None
table_type = u('appendable')
def write(self, obj, axes=None, append=False, complib=None,
complevel=None, fletcher32=None, min_itemsize=None,
chunksize=None, expectedrows=None, dropna=False, **kwargs):
if not append and self.is_exists:
self._handle.remove_node(self.group, 'table')
# create the axes
self.create_axes(axes=axes, obj=obj, validate=append,
min_itemsize=min_itemsize,
**kwargs)
for a in self.axes:
a.validate(self, append)
if not self.is_exists:
# create the table
options = self.create_description(complib=complib,
complevel=complevel,
fletcher32=fletcher32,
expectedrows=expectedrows)
# set the table attributes
self.set_attrs()
# create the table
self._handle.create_table(self.group, **options)
else:
pass
# table = self.table
# update my info
self.set_info()
# validate the axes and set the kinds
for a in self.axes:
a.validate_and_set(self, append)
# add the rows
self.write_data(chunksize, dropna=dropna)
def write_data(self, chunksize, dropna=False):
""" we form the data into a 2-d including indexes,values,mask
write chunk-by-chunk """
names = self.dtype.names
nrows = self.nrows_expected
# if dropna==True, then drop ALL nan rows
masks = []
if dropna:
for a in self.values_axes:
# figure the mask: only do if we can successfully process this
# column, otherwise ignore the mask
mask = isna(a.data).all(axis=0)
if isinstance(mask, np.ndarray):
masks.append(mask.astype('u1', copy=False))
# consolidate masks
if len(masks):
mask = masks[0]
for m in masks[1:]:
mask = mask & m
mask = mask.ravel()
else:
mask = None
# broadcast the indexes if needed
indexes = [a.cvalues for a in self.index_axes]
nindexes = len(indexes)
bindexes = []
for i, idx in enumerate(indexes):
# broadcast to all other indexes except myself
if i > 0 and i < nindexes:
repeater = np.prod(
[indexes[bi].shape[0] for bi in range(0, i)])
idx = np.tile(idx, repeater)
if i < nindexes - 1:
repeater = np.prod([indexes[bi].shape[0]
for bi in range(i + 1, nindexes)])
idx = np.repeat(idx, repeater)
bindexes.append(idx)
# transpose the values so first dimension is last
# reshape the values if needed
values = [a.take_data() for a in self.values_axes]
values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1))
for v in values]
bvalues = []
for i, v in enumerate(values):
new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
bvalues.append(values[i].reshape(new_shape))
# write the chunks
if chunksize is None:
chunksize = 100000
rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
chunks = int(nrows / chunksize) + 1
for i in range(chunks):
start_i = i * chunksize
end_i = min((i + 1) * chunksize, nrows)
if start_i >= end_i:
break
self.write_data_chunk(
rows,
indexes=[a[start_i:end_i] for a in bindexes],
mask=mask[start_i:end_i] if mask is not None else None,
values=[v[start_i:end_i] for v in bvalues])
def write_data_chunk(self, rows, indexes, mask, values):
"""
Parameters
----------
rows : an empty memory space where we are putting the chunk
indexes : an array of the indexes
mask : an array of the masks
values : an array of the values
"""
# 0 len
for v in values:
if not np.prod(v.shape):
return
try:
nrows = indexes[0].shape[0]
if nrows != len(rows):
rows = np.empty(nrows, dtype=self.dtype)
names = self.dtype.names
nindexes = len(indexes)
# indexes
for i, idx in enumerate(indexes):
rows[names[i]] = idx
# values
for i, v in enumerate(values):
rows[names[i + nindexes]] = v
# mask
if mask is not None:
m = ~mask.ravel().astype(bool, copy=False)
if not m.all():
rows = rows[m]
except Exception as detail:
raise Exception("cannot create row-data -> %s" % detail)
try:
if len(rows):
self.table.append(rows)
self.table.flush()
except Exception as detail:
raise TypeError("tables cannot write this data -> %s" % detail)
def delete(self, where=None, start=None, stop=None, **kwargs):
# delete all rows (and return the nrows)
if where is None or not len(where):
if start is None and stop is None:
nrows = self.nrows
self._handle.remove_node(self.group, recursive=True)
else:
# pytables<3.0 would remove a single row with stop=None
if stop is None:
stop = self.nrows
nrows = self.table.remove_rows(start=start, stop=stop)
self.table.flush()
return nrows
# infer the data kind
if not self.infer_axes():
return None
# create the selection
table = self.table
self.selection = Selection(
self, where, start=start, stop=stop, **kwargs)
values = self.selection.select_coords()
# delete the rows in reverse order
l = Series(values).sort_values()
ln = len(l)
if ln:
# construct groups of consecutive rows
diff = l.diff()
groups = list(diff[diff > 1].index)
# 1 group
if not len(groups):
groups = [0]
# final element
if groups[-1] != ln:
groups.append(ln)
# initial element
if groups[0] != 0:
groups.insert(0, 0)
# we must remove in reverse order!
pg = groups.pop()
for g in reversed(groups):
rows = l.take(lrange(g, pg))
table.remove_rows(start=rows[rows.index[0]
], stop=rows[rows.index[-1]] + 1)
pg = g
self.table.flush()
# return the number of rows removed
return ln
class AppendableFrameTable(AppendableTable):
""" suppor the new appendable table formats """
pandas_kind = u('frame_table')
table_type = u('appendable_frame')
ndim = 2
obj_type = DataFrame
@property
def is_transposed(self):
return self.index_axes[0].axis == 1
def get_object(self, obj):
""" these are written transposed """
if self.is_transposed:
obj = obj.T
return obj
def read(self, where=None, columns=None, **kwargs):
if not self.read_axes(where=where, **kwargs):
return None
info = (self.info.get(self.non_index_axes[0][0], dict())
if len(self.non_index_axes) else dict())
index = self.index_axes[0].values
frames = []
for a in self.values_axes:
# we could have a multi-index constructor here
            # _ensure_index doesn't recognize our list-of-tuples here
if info.get('type') == 'MultiIndex':
cols = MultiIndex.from_tuples(a.values)
else:
cols = Index(a.values)
names = info.get('names')
if names is not None:
cols.set_names(names, inplace=True)
if self.is_transposed:
values = a.cvalues
index_ = cols
cols_ = Index(index, name=getattr(index, 'name', None))
else:
values = a.cvalues.T
index_ = Index(index, name=getattr(index, 'name', None))
cols_ = cols
# if we have a DataIndexableCol, its shape will only be 1 dim
if values.ndim == 1 and isinstance(values, np.ndarray):
values = values.reshape((1, values.shape[0]))
block = make_block(values, placement=np.arange(len(cols_)))
mgr = BlockManager([block], [cols_, index_])
frames.append(DataFrame(mgr))
if len(frames) == 1:
df = frames[0]
else:
df = concat(frames, axis=1)
# apply the selection filters & axis orderings
df = self.process_axes(df, columns=columns)
return df
class AppendableSeriesTable(AppendableFrameTable):
""" support the new appendable table formats """
pandas_kind = u('series_table')
table_type = u('appendable_series')
ndim = 2
obj_type = Series
storage_obj_type = DataFrame
@property
def is_transposed(self):
return False
def get_object(self, obj):
return obj
def write(self, obj, data_columns=None, **kwargs):
""" we are going to write this as a frame table """
if not isinstance(obj, DataFrame):
name = obj.name or 'values'
obj = DataFrame({name: obj}, index=obj.index)
obj.columns = [name]
return super(AppendableSeriesTable, self).write(
obj=obj, data_columns=obj.columns.tolist(), **kwargs)
def read(self, columns=None, **kwargs):
is_multi_index = self.is_multi_index
if columns is not None and is_multi_index:
for n in self.levels:
if n not in columns:
columns.insert(0, n)
s = super(AppendableSeriesTable, self).read(columns=columns, **kwargs)
if is_multi_index:
s.set_index(self.levels, inplace=True)
s = s.iloc[:, 0]
# remove the default name
if s.name == 'values':
s.name = None
return s
class AppendableMultiSeriesTable(AppendableSeriesTable):
""" support the new appendable table formats """
pandas_kind = u('series_table')
table_type = u('appendable_multiseries')
def write(self, obj, **kwargs):
""" we are going to write this as a frame table """
name = obj.name or 'values'
obj, self.levels = self.validate_multiindex(obj)
cols = list(self.levels)
cols.append(name)
obj.columns = cols
return super(AppendableMultiSeriesTable, self).write(obj=obj, **kwargs)
class GenericTable(AppendableFrameTable):
""" a table that read/writes the generic pytables table format """
pandas_kind = u('frame_table')
table_type = u('generic_table')
ndim = 2
obj_type = DataFrame
@property
def pandas_type(self):
return self.pandas_kind
@property
def storable(self):
return getattr(self.group, 'table', None) or self.group
def get_attrs(self):
""" retrieve our attributes """
self.non_index_axes = []
self.nan_rep = None
self.levels = []
self.index_axes = [a.infer(self)
for a in self.indexables if a.is_an_indexable]
self.values_axes = [a.infer(self)
for a in self.indexables if not a.is_an_indexable]
self.data_columns = [a.name for a in self.values_axes]
@property
def indexables(self):
""" create the indexables from the table description """
if self._indexables is None:
d = self.description
            # the index column is just a simple index
self._indexables = [GenericIndexCol(name='index', axis=0)]
for i, n in enumerate(d._v_names):
dc = GenericDataIndexableCol(
name=n, pos=i, values=[n], version=self.version)
self._indexables.append(dc)
return self._indexables
def write(self, **kwargs):
raise NotImplementedError("cannot write on an generic table")
class AppendableMultiFrameTable(AppendableFrameTable):
""" a frame with a multi-index """
table_type = u('appendable_multiframe')
obj_type = DataFrame
ndim = 2
_re_levels = re.compile(r"^level_\d+$")
@property
def table_type_short(self):
return u('appendable_multi')
def write(self, obj, data_columns=None, **kwargs):
if data_columns is None:
data_columns = []
elif data_columns is True:
data_columns = obj.columns.tolist()
obj, self.levels = self.validate_multiindex(obj)
for n in self.levels:
if n not in data_columns:
data_columns.insert(0, n)
return super(AppendableMultiFrameTable, self).write(
obj=obj, data_columns=data_columns, **kwargs)
def read(self, **kwargs):
df = super(AppendableMultiFrameTable, self).read(**kwargs)
df = df.set_index(self.levels)
# remove names for 'level_%d'
df.index = df.index.set_names([
None if self._re_levels.search(l) else l for l in df.index.names
])
return df
class AppendablePanelTable(AppendableTable):
""" suppor the new appendable table formats """
table_type = u('appendable_panel')
ndim = 3
obj_type = Panel
def get_object(self, obj):
""" these are written transposed """
if self.is_transposed:
obj = obj.transpose(*self.data_orientation)
return obj
@property
def is_transposed(self):
return self.data_orientation != tuple(range(self.ndim))
def _reindex_axis(obj, axis, labels, other=None):
ax = obj._get_axis(axis)
labels = _ensure_index(labels)
# try not to reindex even if other is provided
# if it equals our current index
if other is not None:
other = _ensure_index(other)
if (other is None or labels.equals(other)) and labels.equals(ax):
return obj
labels = _ensure_index(labels.unique())
if other is not None:
labels = _ensure_index(other.unique()) & labels
if not labels.equals(ax):
slicer = [slice(None, None)] * obj.ndim
slicer[axis] = labels
obj = obj.loc[tuple(slicer)]
return obj
def _get_info(info, name):
""" get/create the info for this name """
try:
idx = info[name]
except:
idx = info[name] = dict()
return idx
# tz to/from coercion
def _get_tz(tz):
""" for a tz-aware type, return an encoded zone """
zone = timezones.get_timezone(tz)
if zone is None:
zone = tz.utcoffset().total_seconds()
return zone
def _set_tz(values, tz, preserve_UTC=False, coerce=False):
"""
coerce the values to a DatetimeIndex if tz is set
preserve the input shape if possible
Parameters
----------
values : ndarray
tz : string/pickled tz object
preserve_UTC : boolean,
preserve the UTC of the result
coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
"""
if tz is not None:
name = getattr(values, 'name', None)
values = values.ravel()
tz = timezones.get_timezone(_ensure_decoded(tz))
values = DatetimeIndex(values, name=name)
if values.tz is None:
values = values.tz_localize('UTC').tz_convert(tz)
if preserve_UTC:
if tz == 'UTC':
values = list(values)
elif coerce:
values = np.asarray(values, dtype='M8[ns]')
return values
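# Hedged example (illustrative only): tz-aware datetimes are stored as UTC i8
# nanoseconds plus a 'tz' attribute, so on read something like
#     _set_tz(values, 'US/Eastern')
# localizes the raw values to UTC and then converts them to the stored zone.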
def _convert_index(index, encoding=None, errors='strict', format_type=None):
index_name = getattr(index, 'name', None)
if isinstance(index, DatetimeIndex):
converted = index.asi8
return IndexCol(converted, 'datetime64', _tables().Int64Col(),
freq=getattr(index, 'freq', None),
tz=getattr(index, 'tz', None),
index_name=index_name)
elif isinstance(index, TimedeltaIndex):
converted = index.asi8
return IndexCol(converted, 'timedelta64', _tables().Int64Col(),
freq=getattr(index, 'freq', None),
index_name=index_name)
elif isinstance(index, (Int64Index, PeriodIndex)):
atom = _tables().Int64Col()
        # avoid storing an ndarray of Period objects
return IndexCol(index._ndarray_values, 'integer', atom,
freq=getattr(index, 'freq', None),
index_name=index_name)
if isinstance(index, MultiIndex):
raise TypeError('MultiIndex not supported here!')
inferred_type = lib.infer_dtype(index)
values = np.asarray(index)
if inferred_type == 'datetime64':
converted = values.view('i8')
return IndexCol(converted, 'datetime64', _tables().Int64Col(),
freq=getattr(index, 'freq', None),
tz=getattr(index, 'tz', None),
index_name=index_name)
elif inferred_type == 'timedelta64':
converted = values.view('i8')
return IndexCol(converted, 'timedelta64', _tables().Int64Col(),
freq=getattr(index, 'freq', None),
index_name=index_name)
elif inferred_type == 'datetime':
converted = np.asarray([(time.mktime(v.timetuple()) +
v.microsecond / 1E6) for v in values],
dtype=np.float64)
return IndexCol(converted, 'datetime', _tables().Time64Col(),
index_name=index_name)
elif inferred_type == 'date':
converted = np.asarray([v.toordinal() for v in values],
dtype=np.int32)
return IndexCol(converted, 'date', _tables().Time32Col(),
index_name=index_name)
elif inferred_type == 'string':
# atom = _tables().ObjectAtom()
# return np.asarray(values, dtype='O'), 'object', atom
converted = _convert_string_array(values, encoding, errors)
itemsize = converted.dtype.itemsize
return IndexCol(
converted, 'string', _tables().StringCol(itemsize),
itemsize=itemsize, index_name=index_name
)
elif inferred_type == 'unicode':
if format_type == 'fixed':
atom = _tables().ObjectAtom()
return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
index_name=index_name)
raise TypeError(
"[unicode] is not supported as a in index type for [{0}] formats"
.format(format_type)
)
elif inferred_type == 'integer':
# take a guess for now, hope the values fit
atom = _tables().Int64Col()
return IndexCol(np.asarray(values, dtype=np.int64), 'integer', atom,
index_name=index_name)
elif inferred_type == 'floating':
atom = _tables().Float64Col()
return IndexCol(np.asarray(values, dtype=np.float64), 'float', atom,
index_name=index_name)
else: # pragma: no cover
atom = _tables().ObjectAtom()
return IndexCol(np.asarray(values, dtype='O'), 'object', atom,
index_name=index_name)
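
# Illustrative sketch (hypothetical helper, not part of the original module):
# _convert_index chooses the storage kind and PyTables atom from the inferred
# dtype of the index; strings, for example, become a fixed-width StringCol.
def _example_convert_index():  # pragma: no cover
    col = _convert_index(Index(['foo', 'bar']), encoding='UTF-8')
    # col.kind == 'string' and col.values is a fixed-width bytes array ('S3')
    return col.kind, col.values.dtype
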
def _unconvert_index(data, kind, encoding=None, errors='strict'):
kind = _ensure_decoded(kind)
if kind == u('datetime64'):
index = DatetimeIndex(data)
elif kind == u('timedelta64'):
index = TimedeltaIndex(data)
elif kind == u('datetime'):
index = np.asarray([datetime.fromtimestamp(v) for v in data],
dtype=object)
elif kind == u('date'):
try:
index = np.asarray(
[date.fromordinal(v) for v in data], dtype=object)
        except ValueError:
index = np.asarray(
[date.fromtimestamp(v) for v in data], dtype=object)
elif kind in (u('integer'), u('float')):
index = np.asarray(data)
    elif kind == u('string'):
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding,
errors=errors)
elif kind == u('object'):
index = np.asarray(data[0])
else: # pragma: no cover
raise ValueError('unrecognized index type %s' % kind)
return index
def _unconvert_index_legacy(data, kind, legacy=False, encoding=None,
errors='strict'):
kind = _ensure_decoded(kind)
if kind == u('datetime'):
index = to_datetime(data)
    elif kind == u('integer'):
        index = np.asarray(data, dtype=object)
    elif kind == u('string'):
index = _unconvert_string_array(data, nan_rep=None, encoding=encoding,
errors=errors)
else: # pragma: no cover
raise ValueError('unrecognized index type %s' % kind)
return index
def _convert_string_array(data, encoding, errors, itemsize=None):
"""
we take a string-like that is object dtype and coerce to a fixed size
string type
Parameters
----------
data : a numpy array of object dtype
encoding : None or string-encoding
errors : handler for encoding errors
itemsize : integer, optional, defaults to the max length of the strings
Returns
-------
data in a fixed-length string dtype, encoded to bytes if needed
"""
# encode if needed
if encoding is not None and len(data):
data = Series(data.ravel()).str.encode(
encoding, errors).values.reshape(data.shape)
# create the sized dtype
if itemsize is None:
ensured = _ensure_object(data.ravel())
itemsize = libwriters.max_len_string_array(ensured)
data = np.asarray(data, dtype="S%d" % itemsize)
return data
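
# Illustrative sketch (hypothetical helper, not part of the original module):
# _convert_string_array encodes an object-dtype array of strings and packs it
# into the smallest fixed-width bytes dtype that fits the longest value.
def _example_convert_string_array():  # pragma: no cover
    arr = np.array(['a', 'bbb'], dtype=object)
    out = _convert_string_array(arr, encoding='UTF-8', errors='strict')
    return out.dtype  # dtype('S3')
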
def _unconvert_string_array(data, nan_rep=None, encoding=None,
errors='strict'):
"""
inverse of _convert_string_array
Parameters
----------
data : fixed length string dtyped array
nan_rep : the storage repr of NaN, optional
encoding : the encoding of the data, optional
errors : handler for encoding errors, default 'strict'
Returns
-------
an object array of the decoded data
"""
shape = data.shape
data = np.asarray(data.ravel(), dtype=object)
# guard against a None encoding in PY3 (because of a legacy
# where the passed encoding is actually None)
encoding = _ensure_encoding(encoding)
if encoding is not None and len(data):
itemsize = libwriters.max_len_string_array(_ensure_object(data))
if compat.PY3:
dtype = "U{0}".format(itemsize)
else:
dtype = "S{0}".format(itemsize)
if isinstance(data[0], compat.binary_type):
data = Series(data).str.decode(encoding, errors=errors).values
else:
data = data.astype(dtype, copy=False).astype(object, copy=False)
if nan_rep is None:
nan_rep = 'nan'
data = libwriters.string_array_replace_from_nan_rep(data, nan_rep)
return data.reshape(shape)
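
# Illustrative round-trip sketch (hypothetical helper, not part of the
# original module): _unconvert_string_array decodes the fixed-width bytes
# produced by _convert_string_array back to an object array of str, mapping
# the stored nan_rep back to np.nan.
def _example_unconvert_string_array():  # pragma: no cover
    raw = np.array([b'foo', b'nan'], dtype='S3')
    out = _unconvert_string_array(raw, nan_rep='nan', encoding='UTF-8')
    return out  # array(['foo', nan], dtype=object)
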
def _maybe_convert(values, val_kind, encoding, errors):
if _need_convert(val_kind):
conv = _get_converter(val_kind, encoding, errors)
# conv = np.frompyfunc(conv, 1, 1)
values = conv(values)
return values
def _get_converter(kind, encoding, errors):
kind = _ensure_decoded(kind)
if kind == 'datetime64':
return lambda x: np.asarray(x, dtype='M8[ns]')
elif kind == 'datetime':
return lambda x: to_datetime(x, cache=True).to_pydatetime()
elif kind == 'string':
return lambda x: _unconvert_string_array(x, encoding=encoding,
errors=errors)
else: # pragma: no cover
raise ValueError('invalid kind %s' % kind)
def _need_convert(kind):
kind = _ensure_decoded(kind)
if kind in (u('datetime'), u('datetime64'), u('string')):
return True
return False
class Selection(object):
"""
Carries out a selection operation on a tables.Table object.
Parameters
----------
table : a Table object
where : list of Terms (or convertible to)
    start, stop : indices to start and/or stop selection
"""
def __init__(self, table, where=None, start=None, stop=None, **kwargs):
self.table = table
self.where = where
self.start = start
self.stop = stop
self.condition = None
self.filter = None
self.terms = None
self.coordinates = None
if is_list_like(where):
# see if we have a passed coordinate like
try:
inferred = lib.infer_dtype(where)
if inferred == 'integer' or inferred == 'boolean':
where = np.asarray(where)
if where.dtype == np.bool_:
start, stop = self.start, self.stop
if start is None:
start = 0
if stop is None:
stop = self.table.nrows
self.coordinates = np.arange(start, stop)[where]
elif issubclass(where.dtype.type, np.integer):
if ((self.start is not None and
(where < self.start).any()) or
(self.stop is not None and
(where >= self.stop).any())):
raise ValueError(
"where must have index locations >= start and "
"< stop"
)
self.coordinates = where
            except Exception:
                # not a coordinate-like after all; fall back to term generation
                pass
if self.coordinates is None:
self.terms = self.generate(where)
# create the numexpr & the filter
if self.terms is not None:
self.condition, self.filter = self.terms.evaluate()
def generate(self, where):
""" where can be a : dict,list,tuple,string """
if where is None:
return None
q = self.table.queryables()
try:
return Expr(where, queryables=q, encoding=self.table.encoding)
except NameError:
# raise a nice message, suggesting that the user should use
# data_columns
raise ValueError(
"The passed where expression: {0}\n"
" contains an invalid variable reference\n"
" all of the variable references must be a "
"reference to\n"
" an axis (e.g. 'index' or 'columns'), or a "
"data_column\n"
" The currently defined references are: {1}\n"
.format(where, ','.join(q.keys()))
)
def select(self):
"""
generate the selection
"""
if self.condition is not None:
return self.table.table.read_where(self.condition.format(),
start=self.start,
stop=self.stop)
elif self.coordinates is not None:
return self.table.table.read_coordinates(self.coordinates)
return self.table.table.read(start=self.start, stop=self.stop)
def select_coords(self):
"""
generate the selection
"""
start, stop = self.start, self.stop
nrows = self.table.nrows
if start is None:
start = 0
elif start < 0:
start += nrows
        if stop is None:
stop = nrows
elif stop < 0:
stop += nrows
if self.condition is not None:
return self.table.table.get_where_list(self.condition.format(),
start=start, stop=stop,
sort=True)
elif self.coordinates is not None:
return self.coordinates
return np.arange(start, stop)
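
# Illustrative sketch (hypothetical helper; assumes ``store`` is an open
# HDFStore holding a table-format node under 'df' with a numeric index):
# Selection is normally driven by HDFStore.select, but it can be exercised
# directly against the node's storer.
def _example_selection(store):  # pragma: no cover
    storer = store.get_storer('df')
    sel = Selection(storer, where='index >= 0')
    coords = sel.select_coords()  # row numbers satisfying the condition
    rows = sel.select()           # the matching raw records
    return coords, rows
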
# utilities ###
def timeit(key, df, fn=None, remove=True, **kwargs):
    """ quick benchmarking helper: append df to a throwaway HDF5 file """
    if fn is None:
        fn = 'timeit.h5'
store = HDFStore(fn, mode='w')
store.append(key, df, **kwargs)
store.close()
if remove:
os.remove(fn)
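
# Illustrative usage sketch (the DataFrame below is hypothetical): ``timeit``
# writes df to a throwaway table-format HDF5 file and removes it afterwards;
# wrap the call with IPython's %timeit (or similar) to measure write speed.
#
#   >>> df = DataFrame({'A': np.random.randn(100000)})
#   >>> timeit('df', df, chunksize=10000)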