# encoding: utf-8
"""
# hickle.py
Created by Danny Price 2016-02-03.
Hickle is an HDF5-based clone of Pickle. Instead of serializing to a pickle
file, Hickle dumps to an HDF5 file. It is designed to be as similar to pickle
in usage as possible, providing load() and dump() functions.
## Notes
Hickle has two main advantages over Pickle:
1) LARGE PICKLE HANDLING. Unpickling a large pickle is slow, as the Unpickler
reads the entire pickle file into memory at once. In comparison, HDF5 files are
designed for large datasets; data are only loaded when accessed.
2) CROSS-PLATFORM SUPPORT. Attempting to load on Linux a pickle that was
created on Windows (or vice versa) is likely to fail with errors like
"Insecure string pickle". HDF5 files load fine, as long as both machines have
h5py installed.
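## Example usage
A minimal round-trip sketch (the filename 'test.hkl' and the data below are
arbitrary placeholders):

    import hickle as hkl
    import numpy as np

    data = {'name': 'test', 'values': np.arange(10)}

    hkl.dump(data, 'test.hkl', mode='w')   # write the dict to an HDF5 file
    restored = hkl.load('test.hkl')        # read it back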
"""
from __future__ import absolute_import
import sys
import os
from pkg_resources import get_distribution
import numpy as np
import h5py as h5
from .helpers import get_type_and_data, sort_keys, check_is_iterable, check_iterable_item_type
from .lookup import types_dict, hkl_types_dict, types_not_to_sort, \
container_types_dict, container_key_types_dict
from .lookup import check_is_ndarray_like
try:
from exceptions import Exception
from types import NoneType
except ImportError:
pass # above imports will fail in python3
import six
import io
# Import a default 'pickler'
# Not the nicest import code, but should work on Py2/Py3
try:
import dill as pickle
except ImportError:
try:
import cPickle as pickle
except ImportError:
import pickle
except ModuleNotFoundError:
import pickle
except ModuleNotFoundError:
import pickle
import warnings
from pkg_resources import get_distribution, DistributionNotFound
try:
__version__ = get_distribution('hickle').version
except DistributionNotFound:
__version__ = '0.0.0 - please install via pip/setup.py'
##################
# Error handling #
##################
class FileError(Exception):
""" An exception raised if the file is fishy """
def __init__(self):
return
def __str__(self):
return ("Cannot open file. Please pass either a filename "
"string, a file object, or a h5py.File")
class ClosedFileError(Exception):
""" An exception raised if the file is fishy """
def __init__(self):
return
def __str__(self):
return ("HDF5 file has been closed. Please pass either "
"a filename string, a file object, or an open h5py.File")
class NoMatchError(Exception):
""" An exception raised if the object type is not understood (or
supported)"""
def __init__(self):
return
def __str__(self):
return ("Error: this type of python object cannot be converted into a "
"hickle.")
class ToDoError(Exception):
""" An exception raised for non-implemented functionality"""
def __init__(self):
return
def __str__(self):
return "Error: this functionality hasn't been implemented yet."
class SerializedWarning(UserWarning):
""" An object type was not understood
The data will be serialized using pickle.
"""
pass
######################
# H5PY file wrappers #
######################
class H5GroupWrapper(h5.Group):
""" Group wrapper that provides a track_times kwarg.
track_times is a boolean flag that can be set to False, so that two
files created at different times will have identical MD5 hashes.
"""
def create_dataset(self, *args, **kwargs):
kwargs['track_times'] = getattr(self, 'track_times', True)
return super(H5GroupWrapper, self).create_dataset(*args, **kwargs)
def create_group(self, *args, **kwargs):
group = super(H5GroupWrapper, self).create_group(*args, **kwargs)
group.__class__ = H5GroupWrapper
group.track_times = getattr(self, 'track_times', True)
return group
class H5FileWrapper(h5.File):
""" Wrapper for h5py File that provides a track_times kwarg.
track_times is a boolean flag that can be set to False, so that two
files created at different times will have identical MD5 hashes.
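Example (sketch of how file_opener() below applies this wrapper; it is not
normally instantiated directly):

    h5f = h5.File('test.h5', 'w')
    h5f.__class__ = H5FileWrapper
    h5f.track_times = False
    # groups and datasets created via h5f now default to track_times=False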
"""
def create_dataset(self, *args, **kwargs):
kwargs['track_times'] = getattr(self, 'track_times', True)
return super(H5FileWrapper, self).create_dataset(*args, **kwargs)
def create_group(self, *args, **kwargs):
group = super(H5FileWrapper, self).create_group(*args, **kwargs)
group.__class__ = H5GroupWrapper
group.track_times = getattr(self, 'track_times', True)
return group
def file_opener(f, mode='r', track_times=True):
""" A file opener helper function with some error handling. This can open
files through a file object, a h5py file, or just the filename.
Args:
f (file, h5py.File, or string): File-identifier, e.g. filename or file object.
mode (str): File open mode. Only required if opening by filename string.
track_times (bool): Track time in HDF5; turn off if you want hickling at
different times to produce identical files (e.g. for MD5 hash check).
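Example (illustrative sketch; 'test.hkl' is an arbitrary filename):

    h5f = file_opener('test.hkl', mode='w')                      # open by filename
    h5f = file_opener('test.hkl', mode='w', track_times=False)   # reproducible files
    h5f = file_opener(h5.File('test.hkl', 'r'))                  # pass an open h5py.File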
"""
# Were we handed a file object or just a file name string?
if six.PY2:
if isinstance(f, file):
filename, mode = f.name, f.mode
f.close()
h5f = h5.File(filename, mode)
elif isinstance(f, str) or isinstance(f, unicode):
filename = f
h5f = h5.File(filename, mode)
elif isinstance(f, H5FileWrapper) or isinstance(f, h5._hl.files.File):
try:
filename = f.filename
except ValueError:
raise ClosedFileError()
h5f = f
else:
print(type(f))
raise FileError
else:
if isinstance(f, io.TextIOWrapper):
filename, mode = f.name, f.mode
f.close()
h5f = h5.File(filename, mode)
elif isinstance(f, str) or isinstance(f, bytes):
filename = f
h5f = h5.File(filename, mode)
elif isinstance(f, H5FileWrapper) or isinstance(f, h5._hl.files.File):
try:
filename = f.filename
except ValueError:
raise ClosedFileError()
h5f = f
else:
print(type(f))
raise FileError
h5f.__class__ = H5FileWrapper
h5f.track_times = track_times
return h5f
###########
# DUMPERS #
###########
def _dump(py_obj, h_group, call_id=0, **kwargs):
""" Dump a python object to a group within a HDF5 file.
This function is called recursively by the main dump() function.
Args:
py_obj: python object to dump.
h_group (h5.File.group): group to dump data into.
call_id (int): index to identify object's relative location in the iterable.
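Example (illustrative; group and dataset names follow the 'data_%i' pattern
used by the creator functions below):

    # _dump([1, 'a'], h_root_group) produces roughly:
    #   /data_0           group for the mixed-type list
    #   /data_0/data_0    dataset holding 1
    #   /data_0/data_1    dataset holding 'a'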
"""
if six.PY2:
dumpable_dtypes = (bool, int, float, long, complex, str, unicode)
else:
dumpable_dtypes = (bool, int, float, complex, bytes, str)
# Firstly, check if item is a numpy array. If so, just dump it.
if check_is_ndarray_like(py_obj):
create_hkl_dataset(py_obj, h_group, call_id, **kwargs)
# next, check if item is iterable
elif check_is_iterable(py_obj):
item_type = check_iterable_item_type(py_obj)
# item_type == False implies multiple types. Create a dataset
if item_type is False:
h_subgroup = create_hkl_group(py_obj, h_group, call_id)
for ii, py_subobj in enumerate(py_obj):
_dump(py_subobj, h_subgroup, call_id=ii, **kwargs)
# otherwise, subitems have same type. Check if subtype is an iterable
# (e.g. list of lists), or not (e.g. list of ints, which should be treated
# as a single dataset).
else:
if item_type in dumpable_dtypes:
create_hkl_dataset(py_obj, h_group, call_id, **kwargs)
else:
h_subgroup = create_hkl_group(py_obj, h_group, call_id)
for ii, py_subobj in enumerate(py_obj):
_dump(py_subobj, h_subgroup, call_id=ii, **kwargs)
# item is not iterable, so create a dataset for it
else:
create_hkl_dataset(py_obj, h_group, call_id, **kwargs)
def dump(py_obj, file_obj, mode='w', track_times=True, path='/', **kwargs):
""" Write a pickled representation of obj to the open file object file.
Args:
obj (object): python object o store in a Hickle
file: file object, filename string, or h5py.File object
file in which to store the object. A h5py.File or a filename is also
acceptable.
mode (str): optional argument, 'r' (read only), 'w' (write) or 'a' (append).
Ignored if file is a file object.
compression (str): optional argument. Applies compression to dataset. Options: None, gzip,
lzf (+ szip, if installed)
track_times (bool): optional argument. If set to False, repeated hickling will produce
identical files.
path (str): path within hdf5 file to save data to. Defaults to root /
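Example (sketch; extra keyword arguments such as compression are passed
through to h5py dataset creation):

    import numpy as np
    data = np.random.random((100, 100))
    dump(data, 'out.hkl', mode='w', compression='gzip', track_times=False)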
"""
try:
# Open the file
h5f = file_opener(file_obj, mode, track_times)
h5f.attrs[b"CLASS"] = b'hickle'
h5f.attrs[b"VERSION"] = get_distribution('hickle').version
h5f.attrs[b"type"] = [b'hickle']
# Log which version of python was used to generate the hickle file
pv = sys.version_info
py_ver = "%i.%i.%i" % (pv[0], pv[1], pv[2])
h5f.attrs[b"PYTHON_VERSION"] = py_ver
h_root_group = h5f.get(path)
if h_root_group is None:
h_root_group = h5f.create_group(path)
h_root_group.attrs[b"type"] = [b'hickle']
_dump(py_obj, h_root_group, **kwargs)
h5f.close()
except NoMatchError:
fname = h5f.filename
h5f.close()
try:
os.remove(fname)
except OSError:
warnings.warn("Dump failed. Could not remove %s" % fname)
finally:
raise NoMatchError
def create_dataset_lookup(py_obj):
""" What type of object are we trying to pickle? This is a python
dictionary based equivalent of a case statement. It returns the correct
helper function for a given data type.
Args:
py_obj: python object to look-up what function to use to dump to disk
Returns:
match: function that should be used to dump data to a new dataset
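Example (illustrative):

    fn = create_dataset_lookup({'a': 1})   # returns create_dict_dataset
    fn = create_dataset_lookup(object())   # a type not in types_dict falls back to no_match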
"""
t = type(py_obj)
types_lookup = {dict: create_dict_dataset}
types_lookup.update(types_dict)
match = types_lookup.get(t, no_match)
return match
def create_hkl_dataset(py_obj, h_group, call_id=0, **kwargs):
""" Create a dataset within the hickle HDF5 file
Args:
py_obj: python object to dump.
h_group (h5.File.group): group to dump data into.
call_id (int): index to identify object's relative location in the iterable.
"""
#lookup dataset creator type based on python object type
create_dataset = create_dataset_lookup(py_obj)
# do the creation
create_dataset(py_obj, h_group, call_id, **kwargs)
def create_hkl_group(py_obj, h_group, call_id=0):
""" Create a new group within the hickle file
Args:
py_obj: python object to dump.
h_group (h5.File.group): group to dump data into.
call_id (int): index to identify object's relative location in the iterable.
"""
h_subgroup = h_group.create_group('data_%i' % call_id)
if six.PY2:
h_subgroup.attrs["type"] = [str(type(py_obj))]
else:
h_subgroup.attrs["type"] = [bytes(str(type(py_obj)), 'ascii')]
return h_subgroup
def create_dict_dataset(py_obj, h_group, call_id=0, **kwargs):
""" Creates a data group for each key in dictionary
Notes:
This is a very important function which uses the recursive _dump
method to build up hierarchical data models stored in the HDF5 file.
As this is critical to functioning, it is kept in the main hickle.py
file instead of in the loaders/ directory.
Args:
py_obj: python object to dump; should be dictionary
h_group (h5.File.group): group to dump data into.
call_id (int): index to identify object's relative location in the iterable.
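Example (illustrative layout; names follow the 'data_%i' / str(key)
conventions used below):

    # create_dict_dataset({'a': 1}, h_root_group) produces roughly:
    #   /data_0            group with attrs['type'] = [b'dict']
    #   /data_0/a          group with attrs['type'] = [b'dict_item']
    #   /data_0/a/data_0   dataset holding 1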
"""
h_dictgroup = h_group.create_group('data_%i' % call_id)
h_dictgroup.attrs["type"] = [b'dict']
for key, py_subobj in py_obj.items():
if six.PY2:
if type(key) in (unicode, str):
h_subgroup = h_dictgroup.create_group(key)
else:
h_subgroup = h_dictgroup.create_group(str(key))
else:
h_subgroup = h_dictgroup.create_group(str(key))
h_subgroup.attrs["type"] = [b'dict_item']
if six.PY2:
h_subgroup.attrs["key_type"] = [str(type(key))]
else:
tk = str(type(key)).encode('utf-8')
h_subgroup.attrs["key_type"] = [tk]
_dump(py_subobj, h_subgroup, call_id=0, **kwargs)
def no_match(py_obj, h_group, call_id=0, **kwargs):
""" If no match is made, raise an exception
Args:
py_obj: python object to dump; default if item is not matched.
h_group (h5.File.group): group to dump data into.
call_id (int): index to identify object's relative location in the iterable.
"""
pickled_obj = pickle.dumps(py_obj)
d = h_group.create_dataset('data_%i' % call_id, data=[pickled_obj])
d.attrs["type"] = [b'pickle']
warnings.warn("%s type not understood, data have been serialized" % type(py_obj),
SerializedWarning)
#############
## LOADERS ##
#############
class PyContainer(list):
""" A group-like object into which to load datasets.
In order to build up a tree-like structure, we need to be able
to load datasets into a container with an append() method.
Python tuples and sets do not allow this. This class provides
a list-like object that can be converted into a list, tuple, set or dict.
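Example (conceptual sketch; the exact container_type byte strings and their
conversion functions are defined in lookup.py and may differ):

    pc = PyContainer()
    pc.container_type = b"<type 'tuple'>"   # hypothetical key into container_types_dict
    pc.append(1)
    pc.append(2)
    result = pc.convert()   # converted by the matching entry in container_types_dict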
"""
def __init__(self):
super(PyContainer, self).__init__()
self.container_type = None
self.name = None
self.key_type = None
def convert(self):
""" Convert from PyContainer to python core data type.
Returns: self, either as a list, tuple, set or dict
(or other type specified in lookup.py)
"""
if self.container_type in container_types_dict.keys():
convert_fn = container_types_dict[self.container_type]
return convert_fn(self)
if self.container_type == b"dict":
keys = []
for item in self:
key = item.name.split('/')[-1]
key_type = item.key_type[0]
if key_type in container_key_types_dict.keys():
to_type_fn = container_key_types_dict[key_type]
key = to_type_fn(key)
keys.append(key)
items = [item[0] for item in self]
return dict(zip(keys, items))
else:
return self
def no_match_load(key):
""" If no match is made when loading, need to raise an exception
"""
raise RuntimeError("Cannot load %s data type" % key)
#pass
def load_dataset_lookup(key):
""" What type of object are we trying to unpickle? This is a python
dictionary based equivalent of a case statement. It returns the type
a given 'type' keyword in the hickle file.
Args:
py_obj: python object to look-up what function to use to dump to disk
Returns:
match: function that should be used to dump data to a new dataset
"""
match = hkl_types_dict.get(key, no_match_load)
return match
def load(fileobj, path='/', safe=True):
""" Load a hickle file and reconstruct a python object
Args:
fileobj: file object, h5py.File, or filename string
safe (bool): Disable automatic depickling of arbitrary python objects.
DO NOT set this to False unless the file is from a trusted source.
(see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation)
path (str): path within hdf5 file to load data from. Defaults to root /
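Example (illustrative; assumes 'test.hkl' was previously written by dump()):

    data = load('test.hkl')
    sub = load('test.hkl', path='/my_group')   # load from a non-root path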
"""
try:
with file_opener(fileobj) as h5f:
h_root_group = h5f.get(path)
try:
assert 'CLASS' in h5f.attrs.keys()
assert 'VERSION' in h5f.attrs.keys()
VER = h5f.attrs['VERSION']
try:
VER_MAJOR = int(VER)
except ValueError:
VER_MAJOR = int(VER[0])
if VER_MAJOR == 1:
if six.PY2:
warnings.warn("Hickle file versioned as V1, attempting legacy loading...")
from . import hickle_legacy
return hickle_legacy.load(fileobj, safe)
else:
raise RuntimeError("Cannot open file. This file was likely"
" created with Python 2 and an old hickle version.")
elif VER_MAJOR == 2:
if six.PY2:
warnings.warn("Hickle file appears to be old version (v2), attempting "
"legacy loading...")
from . import hickle_legacy2
return hickle_legacy2.load(fileobj, safe=safe)
else:
raise RuntimeError("Cannot open file. This file was likely"
" created with Python 2 and an old hickle version.")
# There is an unfortunate period of time where hickle 2.1.0 claims VERSION = int(3)
# For backward compatibility we really need to catch this.
# Actual hickle v3 files are versioned as A.B.C (e.g. 3.1.0)
elif VER_MAJOR == 3 and VER == VER_MAJOR:
if six.PY2:
warnings.warn("Hickle file appears to be old version (v2.1.0), attempting "
"legacy loading...")
from . import hickle_legacy2
return hickle_legacy2.load(fileobj, safe=safe)
else:
raise RuntimeError("Cannot open file. This file was likely"
" created with Python 2 and an old hickle version.")
elif VER_MAJOR >= 3:
py_container = PyContainer()
py_container.container_type = 'hickle'
py_container = _load(py_container, h_root_group)
return py_container[0][0]
except AssertionError:
if six.PY2:
warnings.warn("Hickle file is not versioned, attempting legacy loading...")
from . import hickle_legacy
return hickle_legacy.load(fileobj, safe)
else:
raise RuntimeError("Cannot open file. This file was likely"
" created with Python 2 and an old hickle version.")
finally:
if 'h5f' in locals():
# Check if file is open, and if so, close it.
if h5f.fid.valid:
h5f.close()
def load_dataset(h_node):
""" Load a dataset, converting into its correct python type
Args:
h_node (h5py dataset): h5py dataset object to read
Returns:
data: reconstructed python object from loaded data
"""
py_type, data = get_type_and_data(h_node)
try:
load_fn = load_dataset_lookup(py_type)
return load_fn(h_node)
except:
raise
#raise RuntimeError("Hickle type %s not understood." % py_type)
def _load(py_container, h_group):
""" Load a hickle file
Recursive funnction to load hdf5 data into a PyContainer()
Args:
py_container (PyContainer): Python container to load data into
h_group (h5 group or dataset): h5py object, group or dataset, to spider
and load all datasets.
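Example (this mirrors how load() above drives the recursion):

    py_container = PyContainer()
    py_container.container_type = 'hickle'
    py_container = _load(py_container, h_root_group)
    data = py_container[0][0]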
"""
group_dtype = h5._hl.group.Group
dataset_dtype = h5._hl.dataset.Dataset
#either a file, group, or dataset
if isinstance(h_group, H5FileWrapper) or isinstance(h_group, group_dtype):
py_subcontainer = PyContainer()
try:
py_subcontainer.container_type = bytes(h_group.attrs['type'][0])
except KeyError:
raise
#py_subcontainer.container_type = ''
py_subcontainer.name = h_group.name
if py_subcontainer.container_type == b'dict_item':
py_subcontainer.key_type = h_group.attrs['key_type']
if py_subcontainer.container_type not in types_not_to_sort:
h_keys = sort_keys(h_group.keys())
else:
h_keys = h_group.keys()
for h_name in h_keys:
h_node = h_group[h_name]
py_subcontainer = _load(py_subcontainer, h_node)
sub_data = py_subcontainer.convert()
py_container.append(sub_data)
else:
# must be a dataset
subdata = load_dataset(h_group)
py_container.append(subdata)
return py_container