# encoding: utf-8
"""
# hickle.py

Created by Danny Price 2016-02-03.

Hickle is an HDF5-based clone of pickle. Instead of serializing to a pickle
file, hickle dumps to an HDF5 file. It is designed to be as similar to pickle
in usage as possible, providing a load() and dump() function.

## Notes

Hickle has two main advantages over pickle:
1) LARGE PICKLE HANDLING. Unpickling a large pickle is slow, as the Unpickler
reads the entire pickle file and loads it into memory. In comparison, HDF5
files are designed for large datasets. Things are only loaded when accessed.

2) CROSS PLATFORM SUPPORT. Attempting to load on Linux a pickle that was
created on Windows (or vice versa) is likely to fail with errors like
"Insecure string pickle". HDF5 files will load fine, as long as both machines
have h5py installed.

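## Usage example

A minimal usage sketch (the filename and data below are illustrative):

    import hickle as hkl
    import numpy as np

    data = {'name': 'test', 'data_arr': np.arange(10)}

    # Dump data to file
    hkl.dump(data, 'new_data_file.hkl')

    # Load data from file
    data2 = hkl.load('new_data_file.hkl')
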
- """

from __future__ import absolute_import
import sys
import os
from pkg_resources import get_distribution, DistributionNotFound

import numpy as np
import h5py as h5


from .helpers import get_type_and_data, sort_keys, check_is_iterable, check_iterable_item_type
from .lookup import types_dict, hkl_types_dict, types_not_to_sort, \
    container_types_dict, container_key_types_dict
from .lookup import check_is_ndarray_like


try:
    from exceptions import Exception
    from types import NoneType
except ImportError:
    pass  # above imports will fail in python3

import six
import io

# Import a default 'pickler'
# Not the nicest import code, but should work on Py2/Py3
try:
    import dill as pickle
except ImportError:
    try:
        import cPickle as pickle
    except ImportError:
        import pickle

import warnings

try:
    __version__ = get_distribution('hickle').version
except DistributionNotFound:
    __version__ = '0.0.0 - please install via pip/setup.py'

##################
# Error handling #
##################

class FileError(Exception):
    """ An exception raised if the file cannot be opened or is not valid """
    def __init__(self):
        return

    def __str__(self):
        return ("Cannot open file. Please pass either a filename "
                "string, a file object, or a h5py.File")


class ClosedFileError(Exception):
    """ An exception raised if the HDF5 file has already been closed """
    def __init__(self):
        return

    def __str__(self):
        return ("HDF5 file has been closed. Please pass either "
                "a filename string, a file object, or an open h5py.File")


class NoMatchError(Exception):
    """ An exception raised if the object type is not understood (or
    supported)"""
    def __init__(self):
        return

    def __str__(self):
        return ("Error: this type of python object cannot be converted into a "
                "hickle.")


class ToDoError(Exception):
    """ An exception raised for non-implemented functionality"""
    def __init__(self):
        return

    def __str__(self):
        return "Error: this functionality hasn't been implemented yet."


class SerializedWarning(UserWarning):
    """ An object type was not understood

    The data will be serialized using pickle.
    """
    pass


######################
# H5PY file wrappers #
######################

class H5GroupWrapper(h5.Group):
    """ Group wrapper that provides a track_times kwarg.

    track_times is a boolean flag that can be set to False, so that two
    files created at different times will have identical MD5 hashes.
    """
    def create_dataset(self, *args, **kwargs):
        kwargs['track_times'] = getattr(self, 'track_times', True)
        return super(H5GroupWrapper, self).create_dataset(*args, **kwargs)

    def create_group(self, *args, **kwargs):
        group = super(H5GroupWrapper, self).create_group(*args, **kwargs)
        group.__class__ = H5GroupWrapper
        group.track_times = getattr(self, 'track_times', True)
        return group


class H5FileWrapper(h5.File):
    """ Wrapper for h5py File that provides a track_times kwarg.

    track_times is a boolean flag that can be set to False, so that two
    files created at different times will have identical MD5 hashes.
    """
    def create_dataset(self, *args, **kwargs):
        kwargs['track_times'] = getattr(self, 'track_times', True)
        return super(H5FileWrapper, self).create_dataset(*args, **kwargs)

    def create_group(self, *args, **kwargs):
        group = super(H5FileWrapper, self).create_group(*args, **kwargs)
        group.__class__ = H5GroupWrapper
        group.track_times = getattr(self, 'track_times', True)
        return group


def file_opener(f, mode='r', track_times=True):
    """ A file opener helper function with some error handling. This can open
    files through a file object, an h5py File, or just the filename.

    Args:
        f (file, h5py.File, or string): File-identifier, e.g. filename or file object.
        mode (str): File open mode. Only required if opening by filename string.
        track_times (bool): Track time in HDF5; turn off if you want hickling at
            different times to produce identical files (e.g. for MD5 hash check).

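    Example:
        A sketch of typical use (the filename is illustrative); each call
        returns an H5FileWrapper:

            h5f = file_opener('my_data.hkl', mode='w')        # by filename
            h5f = file_opener(h5.File('my_data.hkl', 'r'))    # existing h5py.File
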
- """
-
- # Were we handed a file object or just a file name string?
- if six.PY2:
- if isinstance(f, file):
- filename, mode = f.name, f.mode
- f.close()
- h5f = h5.File(filename, mode)
- elif isinstance(f, str) or isinstance(f, unicode):
- filename = f
- h5f = h5.File(filename, mode)
- elif isinstance(f, H5FileWrapper) or isinstance(f, h5._hl.files.File):
- try:
- filename = f.filename
- except ValueError:
- raise ClosedFileError()
- h5f = f
- else:
- print(type(f))
- raise FileError
-
- else:
- if isinstance(f, io.TextIOWrapper):
- filename, mode = f.name, f.mode
- f.close()
- h5f = h5.File(filename, mode)
- elif isinstance(f, str) or isinstance(f, bytes):
- filename = f
- h5f = h5.File(filename, mode)
- elif isinstance(f, H5FileWrapper) or isinstance(f, h5._hl.files.File):
- try:
- filename = f.filename
- except ValueError:
- raise ClosedFileError()
- h5f = f
- else:
- print(type(f))
- raise FileError
-
-
- h5f.__class__ = H5FileWrapper
- h5f.track_times = track_times
- return h5f


###########
# DUMPERS #
###########


def _dump(py_obj, h_group, call_id=0, **kwargs):
    """ Dump a python object to a group within an HDF5 file.

    This function is called recursively by the main dump() function.

    Args:
        py_obj: python object to dump.
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """

    if six.PY2:
        dumpable_dtypes = (bool, int, float, long, complex, str, unicode)
    else:
        dumpable_dtypes = (bool, int, float, complex, bytes, str)

    # Firstly, check if item is a numpy array. If so, just dump it.
    if check_is_ndarray_like(py_obj):
        create_hkl_dataset(py_obj, h_group, call_id, **kwargs)

    # next, check if item is iterable
    elif check_is_iterable(py_obj):
        item_type = check_iterable_item_type(py_obj)

        # item_type == False implies multiple types. Create a group and dump
        # each subitem into it recursively.
        if item_type is False:
            h_subgroup = create_hkl_group(py_obj, h_group, call_id)
            for ii, py_subobj in enumerate(py_obj):
                _dump(py_subobj, h_subgroup, call_id=ii, **kwargs)

        # otherwise, subitems have same type. Check if subtype is an iterable
        # (e.g. list of lists), or not (e.g. list of ints, which should be treated
        # as a single dataset).
        else:
            if item_type in dumpable_dtypes:
                create_hkl_dataset(py_obj, h_group, call_id, **kwargs)
            else:
                h_subgroup = create_hkl_group(py_obj, h_group, call_id)
                for ii, py_subobj in enumerate(py_obj):
                    _dump(py_subobj, h_subgroup, call_id=ii, **kwargs)

    # item is not iterable, so create a dataset for it
    else:
        create_hkl_dataset(py_obj, h_group, call_id, **kwargs)


def dump(py_obj, file_obj, mode='w', track_times=True, path='/', **kwargs):
    """ Write a hickled (HDF5) representation of py_obj to the given file.

    Args:
        py_obj (object): python object to store in a hickle file.
        file_obj: file object, filename string, or h5py.File object in which
            to store the object.
        mode (str): optional argument, 'r' (read only), 'w' (write) or 'a' (append).
            Ignored if file_obj is a file object.
        compression (str): optional argument. Applies compression to dataset. Options: None, gzip,
            lzf (+ szip, if installed)
        track_times (bool): optional argument. If set to False, repeated hickling will produce
            identical files.
        path (str): path within hdf5 file to save data to. Defaults to root /
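
    Example:
        A minimal sketch (the filename is illustrative); track_times=False is
        passed so that repeated dumps produce identical files:

            import hickle as hkl
            import numpy as np
            hkl.dump({'a': np.arange(10)}, 'output.hkl', track_times=False)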
- """
-
- try:
- # Open the file
- h5f = file_opener(file_obj, mode, track_times)
- h5f.attrs[b"CLASS"] = b'hickle'
- h5f.attrs[b"VERSION"] = get_distribution('hickle').version
- h5f.attrs[b"type"] = [b'hickle']
- # Log which version of python was used to generate the hickle file
- pv = sys.version_info
- py_ver = "%i.%i.%i" % (pv[0], pv[1], pv[2])
- h5f.attrs[b"PYTHON_VERSION"] = py_ver
-
- h_root_group = h5f.get(path)
-
- if h_root_group is None:
- h_root_group = h5f.create_group(path)
- h_root_group.attrs[b"type"] = [b'hickle']
-
- _dump(py_obj, h_root_group, **kwargs)
- h5f.close()
- except NoMatchError:
- fname = h5f.filename
- h5f.close()
- try:
- os.remove(fname)
- except OSError:
- warnings.warn("Dump failed. Could not remove %s" % fname)
- finally:
- raise NoMatchError
-
-
def create_dataset_lookup(py_obj):
    """ What type of object are we trying to pickle? This is a python
    dictionary based equivalent of a case statement. It returns the correct
    helper function for a given data type.

    Args:
        py_obj: python object to look-up what function to use to dump to disk

    Returns:
        match: function that should be used to dump data to a new dataset
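
    Example:
        An illustrative sketch of the dispatch behaviour:

            create_dataset_lookup({'a': 1})  # returns create_dict_dataset
            create_dataset_lookup(object())  # no registered match, returns no_match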
- """
- t = type(py_obj)
- types_lookup = {dict: create_dict_dataset}
- types_lookup.update(types_dict)
-
- match = types_lookup.get(t, no_match)
-
- return match
-
-
-
- def create_hkl_dataset(py_obj, h_group, call_id=0, **kwargs):
- """ Create a dataset within the hickle HDF5 file
-
- Args:
- py_obj: python object to dump.
- h_group (h5.File.group): group to dump data into.
- call_id (int): index to identify object's relative location in the iterable.
-
- """
- #lookup dataset creator type based on python object type
- create_dataset = create_dataset_lookup(py_obj)
-
- # do the creation
- create_dataset(py_obj, h_group, call_id, **kwargs)
-

def create_hkl_group(py_obj, h_group, call_id=0):
    """ Create a new group within the hickle file

    Args:
        py_obj: python object for which the group is being created.
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.

    """
    h_subgroup = h_group.create_group('data_%i' % call_id)
    if six.PY2:
        h_subgroup.attrs["type"] = [str(type(py_obj))]
    else:
        h_subgroup.attrs["type"] = [bytes(str(type(py_obj)), 'ascii')]
    return h_subgroup


def create_dict_dataset(py_obj, h_group, call_id=0, **kwargs):
    """ Creates a data group for each key in a dictionary

    Notes:
        This is a very important function which uses the recursive _dump
        method to build up hierarchical data models stored in the HDF5 file.
        As this is critical to functioning, it is kept in the main hickle.py
        file instead of in the loaders/ directory.

    Args:
        py_obj: python object to dump; should be a dictionary
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
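
    Example:
        A rough, indicative sketch of the HDF5 layout produced when dumping
        {'a': 1} with call_id=0 (exact dataset names depend on the loaders):

            data_0            (group, attrs['type'] = [b'dict'])
            data_0/a          (group, attrs['type'] = [b'dict_item'], plus a 'key_type' attr)
            data_0/a/data_0   (dataset holding the value 1)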
- """
- h_dictgroup = h_group.create_group('data_%i' % call_id)
- h_dictgroup.attrs["type"] = [b'dict']
-
- for key, py_subobj in py_obj.items():
- if six.PY2:
- if type(key) in (unicode, str):
- h_subgroup = h_dictgroup.create_group(key)
- else:
- h_subgroup = h_dictgroup.create_group(str(key))
- else:
- h_subgroup = h_dictgroup.create_group(str(key))
- h_subgroup.attrs["type"] = [b'dict_item']
-
- if six.PY2:
- h_subgroup.attrs["key_type"] = [str(type(key))]
- else:
- tk = str(type(key)).encode('utf-8')
- h_subgroup.attrs["key_type"] = [tk]
-
- _dump(py_subobj, h_subgroup, call_id=0, **kwargs)
-

def no_match(py_obj, h_group, call_id=0, **kwargs):
    """ If no match is made, fall back to serializing the object with pickle.

    Args:
        py_obj: python object to dump; default if item is not matched.
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
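
    Example:
        An illustrative sketch: instances of user-defined classes end up here,
        are stored as a pickle string, and trigger a SerializedWarning:

            import hickle as hkl

            class Foo(object):
                pass

            hkl.dump(Foo(), 'output.hkl')  # warns that Foo is not understood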
- """
- pickled_obj = pickle.dumps(py_obj)
- d = h_group.create_dataset('data_%i' % call_id, data=[pickled_obj])
- d.attrs["type"] = [b'pickle']
-
- warnings.warn("%s type not understood, data have been serialized" % type(py_obj),
- SerializedWarning)
-


#############
## LOADERS ##
#############

class PyContainer(list):
    """ A group-like object into which to load datasets.

    In order to build up a tree-like structure, we need to be able
    to load datasets into a container with an append() method.
    Python tuples and sets do not allow this. This class provides
    a list-like object that can be converted into a list, tuple, set or dict.
    """
    def __init__(self):
        super(PyContainer, self).__init__()
        self.container_type = None
        self.name = None
        self.key_type = None

    def convert(self):
        """ Convert from PyContainer to python core data type.

        Returns: self, either as a list, tuple, set or dict
                 (or other type specified in lookup.py)
        """

        if self.container_type in container_types_dict.keys():
            convert_fn = container_types_dict[self.container_type]
            return convert_fn(self)
        if self.container_type == b"dict":
            keys = []
            for item in self:
                key = item.name.split('/')[-1]
                key_type = item.key_type[0]
                if key_type in container_key_types_dict.keys():
                    to_type_fn = container_key_types_dict[key_type]
                    key = to_type_fn(key)
                keys.append(key)

            items = [item[0] for item in self]
            return dict(zip(keys, items))
        else:
            return self


def no_match_load(key):
    """ If no match is made when loading, raise an exception
    """
    raise RuntimeError("Cannot load %s data type" % key)


def load_dataset_lookup(key):
    """ What type of object are we trying to unpickle? This is a python
    dictionary based equivalent of a case statement. It returns the loader
    function for a given 'type' keyword in the hickle file.

    Args:
        key: the 'type' attribute stored alongside the data in the hickle file.

    Returns:
        match: function that should be used to load the data from disk.
    """

    match = hkl_types_dict.get(key, no_match_load)

    return match


def load(fileobj, path='/', safe=True):
    """ Load a hickle file and reconstruct a python object

    Args:
        fileobj: file object, h5py.File, or filename string
        safe (bool): Disable automatic depickling of arbitrary python objects.
            DO NOT set this to False unless the file is from a trusted source.
            (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation)
        path (str): path within hdf5 file to load data from. Defaults to root /
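
    Example:
        A minimal sketch (the filename is illustrative), loading an object
        previously written with dump():

            import hickle as hkl
            data = hkl.load('output.hkl')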
- """
-
- try:
- with file_opener(fileobj) as h5f:
- h_root_group = h5f.get(path)
- try:
- assert 'CLASS' in h5f.attrs.keys()
- assert 'VERSION' in h5f.attrs.keys()
- VER = h5f.attrs['VERSION']
- try:
- VER_MAJOR = int(VER)
- except ValueError:
- VER_MAJOR = int(VER[0])
- if VER_MAJOR == 1:
- if six.PY2:
- warnings.warn("Hickle file versioned as V1, attempting legacy loading...")
- from . import hickle_legacy
- return hickle_legacy.load(fileobj, safe)
- else:
- raise RuntimeError("Cannot open file. This file was likely"
- " created with Python 2 and an old hickle version.")
- elif VER_MAJOR == 2:
- if six.PY2:
- warnings.warn("Hickle file appears to be old version (v2), attempting "
- "legacy loading...")
- from . import hickle_legacy2
- return hickle_legacy2.load(fileobj, safe=safe)
- else:
- raise RuntimeError("Cannot open file. This file was likely"
- " created with Python 2 and an old hickle version.")
- # There is an unfortunate period of time where hickle 2.1.0 claims VERSION = int(3)
- # For backward compatibility we really need to catch this.
- # Actual hickle v3 files are versioned as A.B.C (e.g. 3.1.0)
- elif VER_MAJOR == 3 and VER == VER_MAJOR:
- if six.PY2:
- warnings.warn("Hickle file appears to be old version (v2.1.0), attempting "
- "legacy loading...")
- from . import hickle_legacy2
- return hickle_legacy2.load(fileobj, safe=safe)
- else:
- raise RuntimeError("Cannot open file. This file was likely"
- " created with Python 2 and an old hickle version.")
- elif VER_MAJOR >= 3:
- py_container = PyContainer()
- py_container.container_type = 'hickle'
- py_container = _load(py_container, h_root_group)
- return py_container[0][0]
-
- except AssertionError:
- if six.PY2:
- warnings.warn("Hickle file is not versioned, attempting legacy loading...")
- from . import hickle_legacy
- return hickle_legacy.load(fileobj, safe)
- else:
- raise RuntimeError("Cannot open file. This file was likely"
- " created with Python 2 and an old hickle version.")
- finally:
- if 'h5f' in locals():
- # Check if file is open, and if so, close it.
- if h5f.fid.valid:
- h5f.close()


def load_dataset(h_node):
    """ Load a dataset, converting into its correct python type

    Args:
        h_node (h5py dataset): h5py dataset object to read

    Returns:
        data: reconstructed python object from loaded data
    """
    py_type, data = get_type_and_data(h_node)

    load_fn = load_dataset_lookup(py_type)
    return load_fn(h_node)


def _load(py_container, h_group):
    """ Load a hickle file

    Recursive function to load hdf5 data into a PyContainer()

    Args:
        py_container (PyContainer): Python container to load data into
        h_group (h5 group or dataset): h5py object, group or dataset, to spider
            and load all datasets.
    """

    group_dtype = h5._hl.group.Group
    dataset_dtype = h5._hl.dataset.Dataset

    # either a file, group, or dataset
    if isinstance(h_group, H5FileWrapper) or isinstance(h_group, group_dtype):

        py_subcontainer = PyContainer()
        py_subcontainer.container_type = bytes(h_group.attrs['type'][0])
        py_subcontainer.name = h_group.name

        if py_subcontainer.container_type == b'dict_item':
            py_subcontainer.key_type = h_group.attrs['key_type']

        if py_subcontainer.container_type not in types_not_to_sort:
            h_keys = sort_keys(h_group.keys())
        else:
            h_keys = h_group.keys()

        for h_name in h_keys:
            h_node = h_group[h_name]
            py_subcontainer = _load(py_subcontainer, h_node)

        sub_data = py_subcontainer.convert()
        py_container.append(sub_data)

    else:
        # must be a dataset
        subdata = load_dataset(h_group)
        py_container.append(subdata)

    return py_container