# encoding: utf-8
"""
# hickle_legacy.py

Created by Danny Price 2012-05-28.

Hickle is a HDF5 based clone of Pickle. Instead of serializing to a pickle
file, Hickle dumps to a HDF5 file. It is designed to be as similar to pickle
in usage as possible.

## Notes

This is a legacy handler, for hickle v1 files. If V2 reading fails, this
will be called as a fail-over.

NOTE(review): this module is Python 2 only -- it relies on the `exceptions`
module, the `file` and `unicode` builtins, `cPickle` and `types.NoneType`.
"""

import os
import exceptions
import numpy as np
import h5py as h5
from types import NoneType

__version__ = "1.3.0"
__author__ = "Danny Price"

####################
## Error handling ##
####################


class FileError(exceptions.Exception):
    """ An exception raised if the file is fishy"""

    def __init__(self):
        return

    def __str__(self):
        # BUGFIX: __str__ previously *printed* the message and returned None,
        # which makes str(exc) / traceback formatting raise a TypeError.
        # It must return the message string.
        return ("Error: cannot open file. Please pass either a filename "
                "string, a file object, or a h5py.File")


class NoMatchError(exceptions.Exception):
    """ An exception raised if the object type is not understood (or
    supported)"""

    def __init__(self):
        return

    def __str__(self):
        # BUGFIX: return the message instead of printing it (see FileError).
        return ("Error: this type of python object cannot be converted into "
                "a hickle.")


class ToDoError(exceptions.Exception):
    """ An exception raised for non-implemented functionality"""

    def __init__(self):
        return

    def __str__(self):
        # BUGFIX: return the message instead of printing it (see FileError).
        return "Error: this functionality hasn't been implemented yet."


class H5GroupWrapper(h5.Group):
    """ h5py.Group subclass that forwards a `track_times` flag to every
    dataset and sub-group it creates.

    Setting track_times=False makes repeated dumps of the same data produce
    bit-identical files (HDF5 otherwise embeds modification timestamps).
    """

    def create_dataset(self, *args, **kwargs):
        kwargs['track_times'] = getattr(self, 'track_times', True)
        return super(H5GroupWrapper, self).create_dataset(*args, **kwargs)

    def create_group(self, *args, **kwargs):
        group = super(H5GroupWrapper, self).create_group(*args, **kwargs)
        # Re-class the new group in place so it inherits the same behaviour.
        group.__class__ = H5GroupWrapper
        group.track_times = getattr(self, 'track_times', True)
        return group


class H5FileWrapper(h5.File):
    """ h5py.File subclass that forwards a `track_times` flag to every
    dataset and sub-group it creates (see H5GroupWrapper). """

    def create_dataset(self, *args, **kwargs):
        kwargs['track_times'] = getattr(self, 'track_times', True)
        return super(H5FileWrapper, self).create_dataset(*args, **kwargs)

    def create_group(self, *args, **kwargs):
        group = super(H5FileWrapper, self).create_group(*args, **kwargs)
        group.__class__ = H5GroupWrapper
        group.track_times = getattr(self, 'track_times', True)
        return group


def file_opener(f, mode='r', track_times=True):
    """ A file opener helper function with some error handling.

    This can open files through a file object, a h5py file, or just the
    filename.

    Parameters
    ----------
    f: file object, h5py.File, or filename string
        target to open
    mode: string
        'r' (read), 'w' (write) or 'a' (append); ignored when f is an open
        file object (its own name/mode are reused) or an h5py.File
    track_times: bool
        stored on the returned wrapper; False gives repeatable files

    Returns
    -------
    An open H5FileWrapper instance.

    Raises
    ------
    FileError if f is none of the accepted types.
    """
    # Were we handed a file object or just a file name string?
    if type(f) is file:
        filename, mode = f.name, f.mode
        f.close()
        h5f = h5.File(filename, mode)
    elif type(f) is h5._hl.files.File:
        h5f = f
    elif type(f) is str:
        filename = f
        h5f = h5.File(filename, mode)
    else:
        raise FileError

    # Re-class in place so create_dataset/create_group honour track_times.
    h5f.__class__ = H5FileWrapper
    h5f.track_times = track_times
    return h5f


#############
## dumpers ##
#############

def dump_ndarray(obj, h5f, **kwargs):
    """ dumps an ndarray object to h5py file"""
    h5f.create_dataset('data', data=obj, **kwargs)
    h5f.create_dataset('type', data=['ndarray'])


def dump_np_dtype(obj, h5f, **kwargs):
    """ dumps an np dtype object to h5py file"""
    h5f.create_dataset('data', data=obj)
    h5f.create_dataset('type', data=['np_dtype'])


def dump_np_dtype_dict(obj, h5f, **kwargs):
    """ dumps an np dtype object within a group"""
    h5f.create_dataset('data', data=obj)
    h5f.create_dataset('_data', data=['np_dtype'])


def dump_masked(obj, h5f, **kwargs):
    """ dumps a masked ndarray object to h5py file (data and mask stored
    as separate datasets)"""
    h5f.create_dataset('data', data=obj, **kwargs)
    h5f.create_dataset('mask', data=obj.mask, **kwargs)
    h5f.create_dataset('type', data=['masked'])


def dump_list(obj, h5f, **kwargs):
    """ dumps a list object to h5py file"""
    # Check if there are any numpy arrays in the list; a homogeneous
    # scalar list can be stored as one dataset, arrays need a group.
    contains_numpy = any(isinstance(el, np.ndarray) for el in obj)

    if contains_numpy:
        _dump_list_np(obj, h5f, **kwargs)
    else:
        h5f.create_dataset('data', data=obj, **kwargs)
        h5f.create_dataset('type', data=['list'])


def _dump_list_np(obj, h5f, **kwargs):
    """ Dump a list of numpy objects to file """
    np_group = h5f.create_group('data')
    h5f.create_dataset('type', data=['np_list'])

    # One dataset per list item, named by its position so order is preserved.
    for ii, np_item in enumerate(obj):
        np_group.create_dataset("%s" % ii, data=np_item, **kwargs)


def dump_tuple(obj, h5f, **kwargs):
    """ dumps a tuple object to h5py file"""
    # Check if there are any numpy arrays in the tuple
    contains_numpy = any(isinstance(el, np.ndarray) for el in obj)

    if contains_numpy:
        _dump_tuple_np(obj, h5f, **kwargs)
    else:
        h5f.create_dataset('data', data=obj, **kwargs)
        h5f.create_dataset('type', data=['tuple'])


def _dump_tuple_np(obj, h5f, **kwargs):
    """ Dump a tuple of numpy objects to file """
    np_group = h5f.create_group('data')
    h5f.create_dataset('type', data=['np_tuple'])

    for ii, np_item in enumerate(obj):
        np_group.create_dataset("%s" % ii, data=np_item, **kwargs)


def dump_set(obj, h5f, **kwargs):
    """ dumps a set object to h5py file (stored as a list; order is not
    preserved)"""
    obj = list(obj)
    h5f.create_dataset('data', data=obj, **kwargs)
    h5f.create_dataset('type', data=['set'])


def dump_string(obj, h5f, **kwargs):
    """ dumps a string object to h5py file"""
    h5f.create_dataset('data', data=[obj], **kwargs)
    h5f.create_dataset('type', data=['string'])


def dump_none(obj, h5f, **kwargs):
    """ Dump None type to file """
    # HDF5 has no null; store a placeholder 0 and rely on the type tag.
    h5f.create_dataset('data', data=[0], **kwargs)
    h5f.create_dataset('type', data=['none'])


def dump_unicode(obj, h5f, **kwargs):
    """ dumps a unicode string to h5py file using a variable-length dtype"""
    dt = h5.special_dtype(vlen=unicode)
    ll = len(obj)
    dset = h5f.create_dataset('data', shape=(ll, ), dtype=dt, **kwargs)
    dset[:ll] = obj
    h5f.create_dataset('type', data=['unicode'])


def _dump_dict(dd, hgroup, **kwargs):
    """ Recursive helper: dump dictionary `dd` into h5py group `hgroup`.

    Each value is stored in a dataset named after its key, with a sibling
    "_<key>" dataset recording the original python type so load_dict() can
    reconstruct it. Nested dicts become nested groups.
    """
    for key in dd:
        if type(dd[key]) in (str, int, float, unicode, bool):
            # Figure out type to be stored
            types = {str: 'str', int: 'int', float: 'float',
                     unicode: 'unicode', bool: 'bool', NoneType: 'none'}
            _key = types.get(type(dd[key]))

            # BUGFIX: the unicode branch used to do `dd[key] = str(...)`,
            # mutating the *caller's* dictionary as a side effect of dumping.
            # Coerce into a local instead.
            val = dd[key]
            if _key == 'unicode':
                val = str(val)

            # Store along with dtype info
            hgroup.create_dataset("%s" % key, data=[val], **kwargs)
            hgroup.create_dataset("_%s" % key, data=[_key])

        elif type(dd[key]) in (type(np.array([1])), type(np.ma.array([1]))):
            if hasattr(dd[key], 'mask'):
                hgroup.create_dataset("_%s" % key, data=["masked"])
                hgroup.create_dataset("%s" % key, data=dd[key].data, **kwargs)
                hgroup.create_dataset("_%s_mask" % key, data=dd[key].mask,
                                      **kwargs)
            else:
                hgroup.create_dataset("_%s" % key, data=["ndarray"])
                hgroup.create_dataset("%s" % key, data=dd[key], **kwargs)

        elif type(dd[key]) is list:
            hgroup.create_dataset("%s" % key, data=dd[key], **kwargs)
            hgroup.create_dataset("_%s" % key, data=["list"])

        elif type(dd[key]) is tuple:
            hgroup.create_dataset("%s" % key, data=dd[key], **kwargs)
            hgroup.create_dataset("_%s" % key, data=["tuple"])

        elif type(dd[key]) is set:
            hgroup.create_dataset("%s" % key, data=list(dd[key]), **kwargs)
            hgroup.create_dataset("_%s" % key, data=["set"])

        elif isinstance(dd[key], dict):
            new_group = hgroup.create_group("%s" % key)
            _dump_dict(dd[key], new_group, **kwargs)

        elif type(dd[key]) is NoneType:
            hgroup.create_dataset("%s" % key, data=[0], **kwargs)
            hgroup.create_dataset("_%s" % key, data=["none"])

        else:
            if type(dd[key]).__module__ == np.__name__:
                # numpy scalar dtype (np.float32, np.int64, ...)
                hgroup.create_dataset("%s" % key, data=dd[key])
                hgroup.create_dataset("_%s" % key, data=["np_dtype"])
            else:
                raise NoMatchError


def dump_dict(obj, h5f='', **kwargs):
    """ dumps a dictionary to h5py file """
    h5f.create_dataset('type', data=['dict'])
    hgroup = h5f.create_group('data')
    _dump_dict(obj, hgroup, **kwargs)


def no_match(obj, h5f, *args, **kwargs):
    """ Fallback dumper: serialize the object with (c)Pickle.

    SECURITY NOTE: files written this way embed pickled data, which can
    execute arbitrary code when loaded with safe=False.
    """
    try:
        import dill as cPickle
    except ImportError:
        import cPickle

    pickled_obj = cPickle.dumps(obj)
    h5f.create_dataset('type', data=['pickle'])
    h5f.create_dataset('data', data=[pickled_obj])

    print("Warning: %s type not understood, data have been serialized"
          % type(obj))
    #raise NoMatchError


def dumper_lookup(obj):
    """ What type of object are we trying to pickle?

    This is a python dictionary based equivalent of a case statement.
    It returns the correct helper function for a given data type.
    """
    t = type(obj)

    types = {
        list: dump_list,
        tuple: dump_tuple,
        set: dump_set,
        dict: dump_dict,
        str: dump_string,
        unicode: dump_unicode,
        NoneType: dump_none,
        np.ndarray: dump_ndarray,
        np.ma.core.MaskedArray: dump_masked,
        np.float16: dump_np_dtype,
        np.float32: dump_np_dtype,
        np.float64: dump_np_dtype,
        np.int8: dump_np_dtype,
        np.int16: dump_np_dtype,
        np.int32: dump_np_dtype,
        np.int64: dump_np_dtype,
        np.uint8: dump_np_dtype,
        np.uint16: dump_np_dtype,
        np.uint32: dump_np_dtype,
        np.uint64: dump_np_dtype,
        np.complex64: dump_np_dtype,
        np.complex128: dump_np_dtype,
    }

    match = types.get(t, no_match)
    return match


def dump(obj, file, mode='w', track_times=True, **kwargs):
    """ Write a pickled representation of obj to the open file object file.

    Parameters
    ----------
    obj: object
        python object o store in a Hickle
    file: file object, filename string, or h5py.File object
        file in which to store the object. A h5py.File or a filename is also
        acceptable.
    mode: string
        optional argument, 'r' (read only), 'w' (write) or 'a' (append).
        Ignored if file is a file object.
    compression: str
        optional argument. Applies compression to dataset. Options: None,
        gzip, lzf (+ szip, if installed)
    track_times: bool
        optional argument. If set to False, repeated hickling will produce
        identical files.
    """
    try:
        # See what kind of object to dump
        dumper = dumper_lookup(obj)
        # Open the file
        h5f = file_opener(file, mode, track_times)
        print("dumping %s to file %s" % (type(obj), repr(h5f)))
        dumper(obj, h5f, **kwargs)
        h5f.close()
    except NoMatchError:
        # Dump failed part-way: best-effort removal of the partial file.
        fname = h5f.filename
        h5f.close()
        try:
            os.remove(fname)
        except OSError:
            # Narrowed from a bare except: os.remove raises OSError.
            print("Warning: dump failed. Could not remove %s" % fname)
        finally:
            raise NoMatchError


#############
## loaders ##
#############

def load(file, safe=True):
    """ Load a hickle file and reconstruct a python object

    Parameters
    ----------
    file: file object, h5py.File, or filename string

    safe (bool): Disable automatic depickling of arbitrary python objects.
    DO NOT set this to False unless the file is from a trusted source.
    (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation)
    """
    try:
        h5f = file_opener(file)
        dtype = h5f["type"][0]

        if dtype == 'dict':
            group = h5f["data"]
            data = load_dict(group)
        elif dtype == 'pickle':
            data = load_pickle(h5f, safe)
        elif dtype == 'np_list':
            group = h5f["data"]
            data = load_np_list(group)
        elif dtype == 'np_tuple':
            group = h5f["data"]
            data = load_np_tuple(group)
        elif dtype == 'masked':
            data = np.ma.array(h5f["data"][:], mask=h5f["mask"][:])
        elif dtype == 'none':
            data = None
        else:
            if dtype in ('string', 'unicode'):
                data = h5f["data"][0]
            else:
                try:
                    data = h5f["data"][:]
                except ValueError:
                    # Scalar datasets cannot be sliced; keep the dataset.
                    data = h5f["data"]
            types = {
                'list': list,
                'set': set,
                'unicode': unicode,
                'string': str,
                'ndarray': load_ndarray,
                'np_dtype': load_np_dtype
            }
            # BUGFIX: the fallback used to be no_match, which takes
            # (obj, h5f) and would die with an unrelated TypeError when
            # called here with one argument. Raise the intended error.
            mod = types.get(dtype)
            if mod is None:
                raise NoMatchError
            data = mod(data)
    finally:
        # Only close if file_opener succeeded.
        if 'h5f' in locals():
            h5f.close()
    return data


def load_pickle(h5f, safe=True):
    """ Deserialize and load a pickled object within a hickle file

    WARNING: Pickle can execute arbitrary code on load.

    Parameters
    ----------
    h5f: h5py.File object

    safe (bool): Disable automatic depickling of arbitrary python objects.
    DO NOT set this to False unless the file is from a trusted source.
    (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation)
    """
    if not safe:
        try:
            import dill as cPickle
        except ImportError:
            import cPickle

        data = h5f["data"][:]
        data = cPickle.loads(data[0])
        return data
    else:
        # Refuse to depickle by default; returns None implicitly.
        print("\nWarning: Object is of an unknown type, and has not been loaded")
        print("         for security reasons (it could be malicious code). If")
        print("         you wish to continue, manually set safe=False\n")


def load_np_list(group):
    """ load a numpy list """
    np_list = []
    # Keys are stringified list indices; sorted() restores dump order.
    for key in sorted(group.keys()):
        data = group[key][:]
        np_list.append(data)
    return np_list


def load_np_tuple(group):
    """ load a tuple containing numpy arrays """
    return tuple(load_np_list(group))


def load_ndarray(arr):
    """ Load a numpy array """
    # Nothing to be done!
    return arr


def load_np_dtype(arr):
    """ Load a numpy scalar """
    # Just return first value
    return arr.value


def load_dict(group):
    """ Load dictionary

    Walks the group; each "<key>" dataset has a sibling "_<key>" dataset
    holding the original python type tag (written by _dump_dict), which
    drives the conversion back to a python object.
    """
    dd = {}
    for key in group.keys():
        if isinstance(group[key], h5._hl.group.Group):
            new_group = group[key]
            dd[key] = load_dict(new_group)
        elif not key.startswith("_"):
            _key = "_%s" % key

            if group[_key][0] == 'np_dtype':
                dd[key] = group[key].value
            elif group[_key][0] in ('str', 'int', 'float', 'unicode', 'bool'):
                dd[key] = group[key][0]
            elif group[_key][0] == 'masked':
                key_ma = "_%s_mask" % key
                dd[key] = np.ma.array(group[key][:], mask=group[key_ma])
            else:
                dd[key] = group[key][:]

            # Convert numpy constructs back to string
            dtype = group[_key][0]
            types = {'str': str, 'int': int, 'float': float,
                     'unicode': unicode, 'bool': bool, 'list': list,
                     'none': NoneType}
            try:
                mod = types.get(dtype)
                if dtype == 'none':
                    dd[key] = None
                else:
                    dd[key] = mod(dd[key])
            except (TypeError, ValueError):
                # Narrowed from a bare except. Tags absent from `types`
                # (e.g. 'tuple', 'set') give mod=None, whose call raises
                # TypeError -- the value deliberately stays as loaded.
                pass
    return dd


def load_large(file):
    """ Load a large hickle file (returns the h5py object not the data)

    Parameters
    ----------
    file: file object, h5py.File, or filename string
    """
    h5f = file_opener(file)
    return h5f