laywerrobot/lib/python3.6/site-packages/hickle/hickle_legacy.py
2020-08-27 21:55:39 +02:00

528 lines
15 KiB
Python

# encoding: utf-8
"""
# hickle_legacy.py
Created by Danny Price 2012-05-28.
Hickle is a HDF5 based clone of Pickle. Instead of serializing to a
pickle file, Hickle dumps to a HDF5 file. It is designed to be as similar
to pickle in usage as possible.
## Notes
This is a legacy handler, for hickle v1 files.
If V2 reading fails, this will be called as a fail-over.
"""
import os
import exceptions
import numpy as np
import h5py as h5
from types import NoneType
__version__ = "1.3.0"
__author__ = "Danny Price"
####################
## Error handling ##
####################
class FileError(exceptions.Exception):
    """ An exception raised if the file is fishy (cannot be opened). """
    def __init__(self):
        return
    def __str__(self):
        # BUG FIX: __str__ previously print()ed the message and implicitly
        # returned None, so str(exc) raised "TypeError: __str__ returned
        # non-string". Return the message instead.
        return ("Error: cannot open file. Please pass either a filename "
                "string, a file object, or a h5py.File")
class NoMatchError(exceptions.Exception):
    """ An exception raised if the object type is not understood (or supported)"""
    def __init__(self):
        return
    def __str__(self):
        # BUG FIX: __str__ must return a string; the old code printed the
        # message and returned None, making str(exc) raise TypeError.
        return "Error: this type of python object cannot be converted into a hickle."
class ToDoError(exceptions.Exception):
    """ An exception raised for non-implemented functionality"""
    def __init__(self):
        return
    def __str__(self):
        # BUG FIX: __str__ must return a string; the old code printed the
        # message and returned None, making str(exc) raise TypeError.
        return "Error: this functionality hasn't been implemented yet."
class H5GroupWrapper(h5.Group):
    """ h5py Group subclass that forwards the `track_times` flag to every
    dataset and sub-group created through it.

    `track_times` defaults to True when the attribute has not been set.
    """

    def create_dataset(self, *args, **kwargs):
        track = getattr(self, 'track_times', True)
        kwargs['track_times'] = track
        return super(H5GroupWrapper, self).create_dataset(*args, **kwargs)

    def create_group(self, *args, **kwargs):
        sub_group = super(H5GroupWrapper, self).create_group(*args, **kwargs)
        # Re-class the new group so it propagates the flag further down.
        sub_group.__class__ = H5GroupWrapper
        sub_group.track_times = getattr(self, 'track_times', True)
        return sub_group
class H5FileWrapper(h5.File):
    """ h5py File subclass that forwards the `track_times` flag to every
    dataset and group created through it.

    `track_times` defaults to True when the attribute has not been set.
    """

    def create_dataset(self, *args, **kwargs):
        track = getattr(self, 'track_times', True)
        kwargs['track_times'] = track
        return super(H5FileWrapper, self).create_dataset(*args, **kwargs)

    def create_group(self, *args, **kwargs):
        new_group = super(H5FileWrapper, self).create_group(*args, **kwargs)
        # Groups created from the file must keep propagating the flag.
        new_group.__class__ = H5GroupWrapper
        new_group.track_times = getattr(self, 'track_times', True)
        return new_group
def file_opener(f, mode='r', track_times=True):
    """ A file opener helper function with some error handling.

    `f` may be a (python 2) file object, an already-open h5py File, or a
    filename string; anything else raises FileError.  The returned handle
    is re-classed to H5FileWrapper so track_times is honoured.
    """
    # Were we handed a file object or just a file name string?
    if type(f) is file:
        # Re-open through h5py, reusing the file object's name and mode.
        filename, mode = f.name, f.mode
        f.close()
        h5_file = h5.File(filename, mode)
    elif type(f) is h5._hl.files.File:
        h5_file = f
    elif type(f) is str:
        h5_file = h5.File(f, mode)
    else:
        raise FileError
    # Swap in the wrapper class so created datasets/groups see the flag.
    h5_file.__class__ = H5FileWrapper
    h5_file.track_times = track_times
    return h5_file
#############
## dumpers ##
#############
def dump_ndarray(obj, h5f, **kwargs):
    """ Write an ndarray to the h5py file as a 'data' dataset, tagged with
    a 'type' dataset so load() knows how to restore it. """
    h5f.create_dataset('data', data=obj, **kwargs)
    h5f.create_dataset('type', data=['ndarray'])
def dump_np_dtype(obj, h5f, **kwargs):
    """ Write a numpy scalar to the h5py file, tagged as 'np_dtype'.

    Note: kwargs are accepted but deliberately not forwarded, matching the
    original behaviour (no compression options for scalars).
    """
    h5f.create_dataset('data', data=obj)
    h5f.create_dataset('type', data=['np_dtype'])
def dump_np_dtype_dict(obj, h5f, **kwargs):
    """ Write a numpy scalar inside a dict group; the type tag is stored
    under '_data' (dict convention) rather than 'type'. """
    h5f.create_dataset('data', data=obj)
    h5f.create_dataset('_data', data=['np_dtype'])
def dump_masked(obj, h5f, **kwargs):
    """ Write a masked ndarray: data and mask are stored as separate
    datasets so load() can rebuild the np.ma array. """
    h5f.create_dataset('data', data=obj, **kwargs)
    h5f.create_dataset('mask', data=obj.mask, **kwargs)
    h5f.create_dataset('type', data=['masked'])
def dump_list(obj, h5f, **kwargs):
    """ dumps a list object to h5py file.

    Lists containing any numpy array are delegated to the numpy-aware
    dumper; plain lists are stored directly as a 'data' dataset.
    """
    has_ndarray = False
    for element in obj:
        if isinstance(element, np.ndarray):
            has_ndarray = True
            break
    if has_ndarray:
        _dump_list_np(obj, h5f, **kwargs)
    else:
        h5f.create_dataset('data', data=obj, **kwargs)
        h5f.create_dataset('type', data=['list'])
def _dump_list_np(obj, h5f, **kwargs):
    """ Dump a list of numpy objects to file.

    Each element becomes a dataset named by its list index inside a 'data'
    group; load_np_list() sorts the keys to restore order.
    """
    np_group = h5f.create_group('data')
    h5f.create_dataset('type', data=['np_list'])
    for index, np_item in enumerate(obj):
        np_group.create_dataset(str(index), data=np_item, **kwargs)
def dump_tuple(obj, h5f, **kwargs):
    """ dumps a tuple object to h5py file.

    Tuples containing any numpy array are delegated to the numpy-aware
    dumper; plain tuples are stored directly as a 'data' dataset.
    """
    has_ndarray = False
    for element in obj:
        if isinstance(element, np.ndarray):
            has_ndarray = True
            break
    if has_ndarray:
        _dump_tuple_np(obj, h5f, **kwargs)
    else:
        h5f.create_dataset('data', data=obj, **kwargs)
        h5f.create_dataset('type', data=['tuple'])
def _dump_tuple_np(obj, h5f, **kwargs):
    """ Dump a tuple of numpy objects to file.

    Each element becomes a dataset named by its tuple index inside a 'data'
    group; load_np_tuple() sorts the keys to restore order.
    """
    np_group = h5f.create_group('data')
    h5f.create_dataset('type', data=['np_tuple'])
    for index, np_item in enumerate(obj):
        np_group.create_dataset(str(index), data=np_item, **kwargs)
def dump_set(obj, h5f, **kwargs):
    """ dumps a set object to h5py file.

    The set is materialised as a list first (hdf5 has no set type); load()
    converts it back with set().
    """
    as_list = list(obj)
    h5f.create_dataset('data', data=as_list, **kwargs)
    h5f.create_dataset('type', data=['set'])
def dump_string(obj, h5f, **kwargs):
    """ dumps a string to h5py file, wrapped in a one-element list so it
    is stored as a dataset. """
    wrapped = [obj]
    h5f.create_dataset('data', data=wrapped, **kwargs)
    h5f.create_dataset('type', data=['string'])
def dump_none(obj, h5f, **kwargs):
    """ Dump None type to file: a placeholder [0] is stored and the 'none'
    type tag tells load() to return None. """
    placeholder = [0]
    h5f.create_dataset('data', data=placeholder, **kwargs)
    h5f.create_dataset('type', data=['none'])
def dump_unicode(obj, h5f, **kwargs):
    """ dumps a unicode string to h5py file as a variable-length dataset.

    NOTE(review): python-2-only -- relies on the `unicode` builtin.
    """
    vlen_type = h5.special_dtype(vlen=unicode)
    n_chars = len(obj)
    dset = h5f.create_dataset('data', shape=(n_chars, ), dtype=vlen_type, **kwargs)
    dset[:n_chars] = obj
    h5f.create_dataset('type', data=['unicode'])
def _dump_dict(dd, hgroup, **kwargs):
    """ Recursively dump the dict `dd` into the h5py group `hgroup`.

    Each value is stored as a dataset named after its key, with a companion
    "_<key>" dataset recording the original python type so load_dict() can
    reconstruct it. Masked arrays additionally get a "_<key>_mask" dataset.
    Nested dicts become sub-groups. Raises NoMatchError for unsupported
    value types.

    NOTE(review): python-2-only names (`unicode`, `NoneType`) are used
    throughout -- this legacy module is not expected to run under python 3.
    """
    for key in dd:
        if type(dd[key]) in (str, int, float, unicode, bool):
            # Figure out type to be stored
            types = {str: 'str', int: 'int', float: 'float',
                     unicode: 'unicode', bool: 'bool', NoneType: 'none'}
            _key = types.get(type(dd[key]))
            # Store along with dtype info
            if _key == 'unicode':
                # NOTE(review): lossy for non-ASCII text -- unicode values
                # are coerced with str() before storage (mutates dd).
                dd[key] = str(dd[key])
            hgroup.create_dataset("%s" % key, data=[dd[key]], **kwargs)
            hgroup.create_dataset("_%s" % key, data=[_key])
        elif type(dd[key]) in (type(np.array([1])), type(np.ma.array([1]))):
            # numpy arrays: masked arrays store data and mask separately.
            if hasattr(dd[key], 'mask'):
                hgroup.create_dataset("_%s" % key, data=["masked"])
                hgroup.create_dataset("%s" % key, data=dd[key].data, **kwargs)
                hgroup.create_dataset("_%s_mask" % key, data=dd[key].mask, **kwargs)
            else:
                hgroup.create_dataset("_%s" % key, data=["ndarray"])
                hgroup.create_dataset("%s" % key, data=dd[key], **kwargs)
        elif type(dd[key]) is list:
            hgroup.create_dataset("%s" % key, data=dd[key], **kwargs)
            hgroup.create_dataset("_%s" % key, data=["list"])
        elif type(dd[key]) is tuple:
            hgroup.create_dataset("%s" % key, data=dd[key], **kwargs)
            hgroup.create_dataset("_%s" % key, data=["tuple"])
        elif type(dd[key]) is set:
            # Sets are materialised as lists (hdf5 has no set type).
            hgroup.create_dataset("%s" % key, data=list(dd[key]), **kwargs)
            hgroup.create_dataset("_%s" % key, data=["set"])
        elif isinstance(dd[key], dict):
            # Nested dict: recurse into a new sub-group.
            new_group = hgroup.create_group("%s" % key)
            _dump_dict(dd[key], new_group, **kwargs)
        elif type(dd[key]) is NoneType:
            # None is stored as a [0] placeholder plus a 'none' tag.
            hgroup.create_dataset("%s" % key, data=[0], **kwargs)
            hgroup.create_dataset("_%s" % key, data=["none"])
        else:
            # Fall back for numpy scalar types (np.float64 etc.).
            if type(dd[key]).__module__ == np.__name__:
                #print type(dd[key])
                hgroup.create_dataset("%s" % key, data=dd[key])
                hgroup.create_dataset("_%s" % key, data=["np_dtype"])
                #new_group = hgroup.create_group("%s" % key)
                #dump_np_dtype_dict(dd[key], new_group)
            else:
                raise NoMatchError
def dump_dict(obj, h5f='', **kwargs):
    """ dumps a dictionary to h5py file: a 'dict' type tag plus a 'data'
    group filled in recursively by _dump_dict(). """
    h5f.create_dataset('type', data=['dict'])
    data_group = h5f.create_group('data')
    _dump_dict(obj, data_group, **kwargs)
def no_match(obj, h5f, *args, **kwargs):
    """ Fallback dumper used when no type-specific dumper exists.

    The object is serialized with dill (cPickle if dill is unavailable)
    and stored under 'data' with a 'pickle' type tag; a warning is printed
    rather than raising.
    """
    try:
        import dill as cPickle
    except ImportError:
        import cPickle
    serialized = cPickle.dumps(obj)
    h5f.create_dataset('type', data=['pickle'])
    h5f.create_dataset('data', data=[serialized])
    print("Warning: %s type not understood, data have been serialized" % type(obj))
    #raise NoMatchError
def dumper_lookup(obj):
    """ Return the dumper helper function for obj's exact type.

    Dispatch is by type identity (not isinstance), so subclasses do not
    match; unknown types fall back to no_match (pickle serialization).
    """
    dispatch = {
        list: dump_list,
        tuple: dump_tuple,
        set: dump_set,
        dict: dump_dict,
        str: dump_string,
        unicode: dump_unicode,
        NoneType: dump_none,
        np.ndarray: dump_ndarray,
        np.ma.core.MaskedArray: dump_masked,
    }
    # All numpy scalar types share the same dumper.
    np_scalar_types = (np.float16, np.float32, np.float64,
                       np.int8, np.int16, np.int32, np.int64,
                       np.uint8, np.uint16, np.uint32, np.uint64,
                       np.complex64, np.complex128)
    for scalar_type in np_scalar_types:
        dispatch[scalar_type] = dump_np_dtype
    return dispatch.get(type(obj), no_match)
def dump(obj, file, mode='w', track_times=True, **kwargs):
    """ Write a pickled representation of obj to the open file object file.

    Parameters
    ----------
    obj: object
        python object to store in a Hickle
    file: file object, filename string, or h5py.File object
        file in which to store the object. A h5py.File or a filename is also acceptable.
    mode: string
        optional argument, 'r' (read only), 'w' (write) or 'a' (append). Ignored if file is a file object.
    compression: str
        optional argument. Applies compression to dataset. Options: None, gzip, lzf (+ szip, if installed)
    track_times: bool
        optional argument. If set to False, repeated hickling will produce identical files.

    Raises
    ------
    NoMatchError
        if an unsupported type is found inside a dict (re-raised after the
        partially written file has been removed).
    """
    try:
        # See what kind of object to dump
        dumper = dumper_lookup(obj)
        # Open the file
        h5f = file_opener(file, mode, track_times)
        print("dumping %s to file %s" % (type(obj), repr(h5f)))
        dumper(obj, h5f, **kwargs)
        h5f.close()
    except NoMatchError:
        # NoMatchError can only come from the dumper call (via _dump_dict),
        # so h5f is bound here; clean up the partially written file.
        fname = h5f.filename
        h5f.close()
        try:
            os.remove(fname)
        except:
            # Best-effort cleanup only; report but do not mask the error.
            print("Warning: dump failed. Could not remove %s" % fname)
        finally:
            # Re-raise so the caller still sees the failure.
            raise NoMatchError
#############
## loaders ##
#############
def load(file, safe=True):
    """ Load a hickle file and reconstruct a python object

    Parameters
    ----------
    file: file object, h5py.File, or filename string
    safe (bool): Disable automatic depickling of arbitrary python objects.
        DO NOT set this to False unless the file is from a trusted source.
        (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation)

    Returns
    -------
    The reconstructed python object; None for the 'none' tag or when a
    pickled object is refused under safe=True.
    """
    try:
        h5f = file_opener(file)
        # The 'type' dataset records how the data was hickled.
        dtype = h5f["type"][0]
        if dtype == 'dict':
            group = h5f["data"]
            data = load_dict(group)
        elif dtype == 'pickle':
            data = load_pickle(h5f, safe)
        elif dtype == 'np_list':
            group = h5f["data"]
            data = load_np_list(group)
        elif dtype == 'np_tuple':
            group = h5f["data"]
            data = load_np_tuple(group)
        elif dtype == 'masked':
            # Rebuild the masked array from separate data and mask datasets.
            data = np.ma.array(h5f["data"][:], mask=h5f["mask"][:])
        elif dtype == 'none':
            data = None
        else:
            if dtype in ('string', 'unicode'):
                # Strings were wrapped in a one-element list on dump.
                data = h5f["data"][0]
            else:
                try:
                    data = h5f["data"][:]
                except ValueError:
                    # Scalar datasets do not support slicing.
                    data = h5f["data"]
            # Cast the raw hdf5 data back to the original python type.
            types = {
                'list': list,
                'set': set,
                'unicode': unicode,
                'string': str,
                'ndarray': load_ndarray,
                'np_dtype': load_np_dtype
            }
            mod = types.get(dtype, no_match)
            data = mod(data)
    finally:
        # Always close the file, even if loading raised part-way through.
        if 'h5f' in locals():
            h5f.close()
    return data
def load_pickle(h5f, safe=True):
    """ Deserialize and load a pickled object stored within a hickle file.

    Parameters
    ----------
    h5f: h5py.File object
    safe (bool): Disable automatic depickling of arbitrary python objects.
        DO NOT set this to False unless the file is from a trusted source.
        (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation)

    Returns
    -------
    The unpickled object, or None (with a warning) when safe=True.
    """
    if safe:
        # Refuse to unpickle: arbitrary pickles can execute code on load.
        print("\nWarning: Object is of an unknown type, and has not been loaded")
        print(" for security reasons (it could be malicious code). If")
        print(" you wish to continue, manually set safe=False\n")
        return None
    try:
        import dill as cPickle
    except ImportError:
        import cPickle
    raw = h5f["data"][:]
    return cPickle.loads(raw[0])
def load_np_list(group):
    """ Load a numpy list: datasets are read back in sorted-key order,
    restoring the original list ordering written by _dump_list_np. """
    return [group[key][:] for key in sorted(group.keys())]
def load_np_tuple(group):
    """ load a tuple containing numpy arrays """
    items = load_np_list(group)
    return tuple(items)
def load_ndarray(arr):
    """ Load a numpy array: the h5py slice already yields an ndarray, so
    the value passes through unchanged. """
    return arr
def load_np_dtype(arr):
    """ Load a numpy scalar by reading the dataset's .value attribute. """
    scalar = arr.value
    return scalar
def load_dict(group):
    """ Load a dictionary from an hdf5 group written by _dump_dict().

    Datasets named "_<key>" hold the type tag for dataset "<key>";
    sub-groups are recursed into as nested dicts.
    """
    dd = {}
    for key in group.keys():
        if isinstance(group[key], h5._hl.group.Group):
            # Nested dict: recurse into the sub-group.
            new_group = group[key]
            dd[key] = load_dict(new_group)
        elif not key.startswith("_"):
            # The companion "_<key>" dataset records the stored type.
            _key = "_%s" % key
            if group[_key][0] == 'np_dtype':
                dd[key] = group[key].value
            elif group[_key][0] in ('str', 'int', 'float', 'unicode', 'bool'):
                # Scalars were wrapped in a one-element list on dump.
                dd[key] = group[key][0]
            elif group[_key][0] == 'masked':
                key_ma = "_%s_mask" % key
                dd[key] = np.ma.array(group[key][:], mask=group[key_ma])
            else:
                dd[key] = group[key][:]
            # Convert numpy constructs back to string
            dtype = group[_key][0]
            types = {'str': str, 'int': int, 'float': float,
                     'unicode': unicode, 'bool': bool, 'list': list, 'none' : NoneType}
            try:
                mod = types.get(dtype)
                if dtype == 'none':
                    dd[key] = None
                else:
                    dd[key] = mod(dd[key])
            except:
                # NOTE(review): bare except deliberately keeps the raw value
                # when the cast fails (e.g. dtype not in `types`, mod None).
                pass
    return dd
def load_large(file):
    """ Load a large hickle file (returns the h5py object not the data)

    Parameters
    ----------
    file: file object, h5py.File, or filename string
    """
    return file_opener(file)