# encoding: utf-8
"""
# hickle.py
Created by Danny Price 2016-02-03.
Hickle is an HDF5-based clone of Pickle. Instead of serializing to a pickle
file, Hickle dumps to an HDF5 file. It is designed to be as similar to pickle
in usage as possible, providing load() and dump() functions.
## Notes
Hickle has two main advantages over Pickle:
1) LARGE PICKLE HANDLING. Unpickling a large pickle is slow, as the Unpickler
reads the entire pickle file into memory at once. In comparison, HDF5 files are
designed for large datasets; data are only loaded when accessed.
2) CROSS-PLATFORM SUPPORT. Attempting to load on Linux a pickle that was
created on Windows (or vice versa) is likely to fail with errors like
"Insecure string pickle". HDF5 files load fine, as long as both machines have
h5py installed.
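## Example usage
A minimal round-trip sketch (the filename 'test.hkl' and the data below are
arbitrary placeholders):

    import hickle as hkl
    import numpy as np

    data = {'name': 'test', 'values': np.arange(10)}

    hkl.dump(data, 'test.hkl', mode='w')   # write the dict to an HDF5 file
    restored = hkl.load('test.hkl')        # read it back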
"""
from __future__ import absolute_import
import sys
import os
from pkg_resources import get_distribution
import numpy as np
import h5py as h5
from .helpers import get_type_and_data, sort_keys, check_is_iterable, check_iterable_item_type
from .lookup import types_dict, hkl_types_dict, types_not_to_sort, \
container_types_dict, container_key_types_dict
from .lookup import check_is_ndarray_like
try:
from exceptions import Exception
from types import NoneType
except ImportError:
pass # above imports will fail in python3
import six
import io
# Import a default 'pickler'
# Not the nicest import code, but should work on Py2/Py3
try:
import dill as pickle
except ImportError:
try:
import cPickle as pickle
except ImportError:
import pickle
except ModuleNotFoundError:
import pickle
except ModuleNotFoundError:
import pickle
import warnings
from pkg_resources import get_distribution, DistributionNotFound
try:
__version__ = get_distribution('hickle').version
except DistributionNotFound:
__version__ = '0.0.0 - please install via pip/setup.py'
##################
# Error handling #
##################
class FileError(Exception):
""" An exception raised if the file is fishy """
def __init__(self):
return
def __str__(self):
return ("Cannot open file. Please pass either a filename "
"string, a file object, or a h5py.File")
class ClosedFileError(Exception):
""" An exception raised if the file is fishy """
def __init__(self):
return
def __str__(self):
return ("HDF5 file has been closed. Please pass either "
"a filename string, a file object, or an open h5py.File")
class NoMatchError(Exception):
""" An exception raised if the object type is not understood (or
supported)"""
def __init__(self):
return
def __str__(self):
return ("Error: this type of python object cannot be converted into a "
"hickle.")
class ToDoError(Exception):
""" An exception raised for non-implemented functionality"""
def __init__(self):
return
def __str__(self):
return "Error: this functionality hasn't been implemented yet."
class SerializedWarning(UserWarning):
""" An object type was not understood
The data will be serialized using pickle.
"""
pass
######################
# H5PY file wrappers #
######################
class H5GroupWrapper(h5.Group):
""" Group wrapper that provides a track_times kwarg.
track_times is a boolean flag that can be set to False, so that two
files created at different times will have identical MD5 hashes.
"""
def create_dataset(self, *args, **kwargs):
kwargs['track_times'] = getattr(self, 'track_times', True)
return super(H5GroupWrapper, self).create_dataset(*args, **kwargs)
def create_group(self, *args, **kwargs):
group = super(H5GroupWrapper, self).create_group(*args, **kwargs)
group.__class__ = H5GroupWrapper
group.track_times = getattr(self, 'track_times', True)
return group
class H5FileWrapper(h5.File):
""" Wrapper for h5py File that provides a track_times kwarg.
track_times is a boolean flag that can be set to False, so that two
files created at different times will have identical MD5 hashes.
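Example (sketch of how file_opener() below applies this wrapper; it is not
normally instantiated directly):

    h5f = h5.File('test.h5', 'w')
    h5f.__class__ = H5FileWrapper
    h5f.track_times = False
    # groups and datasets created via h5f now default to track_times=False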
"""
def create_dataset(self, *args, **kwargs):
kwargs['track_times'] = getattr(self, 'track_times', True)
return super(H5FileWrapper, self).create_dataset(*args, **kwargs)
def create_group(self, *args, **kwargs):
group = super(H5FileWrapper, self).create_group(*args, **kwargs)
group.__class__ = H5GroupWrapper
group.track_times = getattr(self, 'track_times', True)
return group
def file_opener(f, mode='r', track_times=True):
""" A file opener helper function with some error handling. This can open
files through a file object, a h5py file, or just the filename.
Args:
f (file, h5py.File, or string): File-identifier, e.g. filename or file object.
mode (str): File open mode. Only required if opening by filename string.
track_times (bool): Track time in HDF5; turn off if you want hickling at
different times to produce identical files (e.g. for MD5 hash check).
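Example (illustrative sketch; 'test.hkl' is an arbitrary filename):

    h5f = file_opener('test.hkl', mode='w')                      # open by filename
    h5f = file_opener('test.hkl', mode='w', track_times=False)   # reproducible files
    h5f = file_opener(h5.File('test.hkl', 'r'))                  # pass an open h5py.File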
"""
# Were we handed a file object or just a file name string?
if six.PY2:
if isinstance(f, file):
filename, mode = f.name, f.mode
f.close()
h5f = h5.File(filename, mode)
elif isinstance(f, str) or isinstance(f, unicode):
filename = f
h5f = h5.File(filename, mode)
elif isinstance(f, H5FileWrapper) or isinstance(f, h5._hl.files.File):
try:
filename = f.filename
except ValueError:
raise ClosedFileError()
h5f = f
else:
print(type(f))
raise FileError
else:
if isinstance(f, io.TextIOWrapper):
filename, mode = f.name, f.mode
f.close()
h5f = h5.File(filename, mode)
elif isinstance(f, str) or isinstance(f, bytes):
filename = f
h5f = h5.File(filename, mode)
elif isinstance(f, H5FileWrapper) or isinstance(f, h5._hl.files.File):
try:
filename = f.filename
except ValueError:
raise ClosedFileError()
h5f = f
else:
print(type(f))
raise FileError
h5f.__class__ = H5FileWrapper
h5f.track_times = track_times
return h5f
###########
# DUMPERS #
###########
def _dump(py_obj, h_group, call_id=0, **kwargs):
""" Dump a python object to a group within a HDF5 file.
This function is called recursively by the main dump() function.
Args:
py_obj: python object to dump.
h_group (h5.File.group): group to dump data into.
call_id (int): index to identify object's relative location in the iterable.
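Example (illustrative; group and dataset names follow the 'data_%i' pattern
used by the creator functions below):

    # _dump([1, 'a'], h_root_group) produces roughly:
    #   /data_0           group for the mixed-type list
    #   /data_0/data_0    dataset holding 1
    #   /data_0/data_1    dataset holding 'a'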
"""
if six.PY2:
dumpable_dtypes = (bool, int, float, long, complex, str, unicode)
else:
dumpable_dtypes = (bool, int, float, complex, bytes, str)
# Firstly, check if item is a numpy array. If so, just dump it.
if check_is_ndarray_like(py_obj):
create_hkl_dataset(py_obj, h_group, call_id, **kwargs)
# next, check if item is iterable
elif check_is_iterable(py_obj):
item_type = check_iterable_item_type(py_obj)
# item_type == False implies multiple types. Create a dataset
if item_type is False:
h_subgroup = create_hkl_group(py_obj, h_group, call_id)
for ii, py_subobj in enumerate(py_obj):
_dump(py_subobj, h_subgroup, call_id=ii, **kwargs)
# otherwise, subitems have same type. Check if subtype is an iterable
# (e.g. list of lists), or not (e.g. list of ints, which should be treated
# as a single dataset).
else:
if item_type in dumpable_dtypes:
create_hkl_dataset(py_obj, h_group, call_id, **kwargs)
else:
h_subgroup = create_hkl_group(py_obj, h_group, call_id)
for ii, py_subobj in enumerate(py_obj):
_dump(py_subobj, h_subgroup, call_id=ii, **kwargs)
# item is not iterable, so create a dataset for it
else:
create_hkl_dataset(py_obj, h_group, call_id, **kwargs)
def dump(py_obj, file_obj, mode='w', track_times=True, path='/', **kwargs):
""" Write a pickled representation of obj to the open file object file.
Args:
obj (object): python object o store in a Hickle
file: file object, filename string, or h5py.File object
file in which to store the object. A h5py.File or a filename is also
acceptable.
mode (str): optional argument, 'r' (read only), 'w' (write) or 'a' (append).
Ignored if file is a file object.
compression (str): optional argument. Applies compression to dataset. Options: None, gzip,
lzf (+ szip, if installed)
track_times (bool): optional argument. If set to False, repeated hickling will produce
identical files.
path (str): path within hdf5 file to save data to. Defaults to root /
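Example (sketch; extra keyword arguments such as compression are passed
through to h5py dataset creation):

    import numpy as np
    data = np.random.random((100, 100))
    dump(data, 'out.hkl', mode='w', compression='gzip', track_times=False)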
"""
try:
# Open the file
h5f = file_opener(file_obj, mode, track_times)
h5f.attrs[b"CLASS"] = b'hickle'
h5f.attrs[b"VERSION"] = get_distribution('hickle').version
h5f.attrs[b"type"] = [b'hickle']
# Log which version of python was used to generate the hickle file
pv = sys.version_info
py_ver = "%i.%i.%i" % (pv[0], pv[1], pv[2])
h5f.attrs[b"PYTHON_VERSION"] = py_ver
h_root_group = h5f.get(path)
if h_root_group is None:
h_root_group = h5f.create_group(path)
h_root_group.attrs[b"type"] = [b'hickle']
_dump(py_obj, h_root_group, **kwargs)
h5f.close()
except NoMatchError:
fname = h5f.filename
h5f.close()
try:
os.remove(fname)
except OSError:
warnings.warn("Dump failed. Could not remove %s" % fname)
finally:
raise NoMatchError
def create_dataset_lookup(py_obj):
""" What type of object are we trying to pickle? This is a python
dictionary based equivalent of a case statement. It returns the correct
helper function for a given data type.
Args:
py_obj: python object to look-up what function to use to dump to disk
Returns:
match: function that should be used to dump data to a new dataset
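Example (illustrative):

    fn = create_dataset_lookup({'a': 1})   # returns create_dict_dataset
    fn = create_dataset_lookup(object())   # a type not in types_dict falls back to no_match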
"""
t = type(py_obj)
types_lookup = {dict: create_dict_dataset}
types_lookup.update(types_dict)
match = types_lookup.get(t, no_match)
return match
def create_hkl_dataset(py_obj, h_group, call_id=0, **kwargs):
""" Create a dataset within the hickle HDF5 file
Args:
py_obj: python object to dump.
h_group (h5.File.group): group to dump data into.
call_id (int): index to identify object's relative location in the iterable.
"""
#lookup dataset creator type based on python object type
create_dataset = create_dataset_lookup(py_obj)
# do the creation
create_dataset(py_obj, h_group, call_id, **kwargs)
def create_hkl_group(py_obj, h_group, call_id=0):
""" Create a new group within the hickle file
Args:
py_obj: python object to dump.
h_group (h5.File.group): group to dump data into.
call_id (int): index to identify object's relative location in the iterable.
"""
h_subgroup = h_group.create_group('data_%i' % call_id)
if six.PY2:
h_subgroup.attrs["type"] = [str(type(py_obj))]
else:
h_subgroup.attrs["type"] = [bytes(str(type(py_obj)), 'ascii')]
return h_subgroup
def create_dict_dataset(py_obj, h_group, call_id=0, **kwargs):
""" Creates a data group for each key in dictionary
Notes:
This is a very important function which uses the recursive _dump
method to build up hierarchical data models stored in the HDF5 file.
As this is critical to functioning, it is kept in the main hickle.py
file instead of in the loaders/ directory.
Args:
py_obj: python object to dump; should be dictionary
h_group (h5.File.group): group to dump data into.
call_id (int): index to identify object's relative location in the iterable.
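Example (illustrative layout; names follow the 'data_%i' / str(key)
conventions used below):

    # create_dict_dataset({'a': 1}, h_root_group) produces roughly:
    #   /data_0            group with attrs['type'] = [b'dict']
    #   /data_0/a          group with attrs['type'] = [b'dict_item']
    #   /data_0/a/data_0   dataset holding 1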
"""
h_dictgroup = h_group.create_group('data_%i' % call_id)
h_dictgroup.attrs["type"] = [b'dict']
for key, py_subobj in py_obj.items():
if six.PY2:
if type(key) in (unicode, str):
h_subgroup = h_dictgroup.create_group(key)
else:
h_subgroup = h_dictgroup.create_group(str(key))
else:
h_subgroup = h_dictgroup.create_group(str(key))
h_subgroup.attrs["type"] = [b'dict_item']
if six.PY2:
h_subgroup.attrs["key_type"] = [str(type(key))]
else:
tk = str(type(key)).encode('utf-8')
h_subgroup.attrs["key_type"] = [tk]
_dump(py_subobj, h_subgroup, call_id=0, **kwargs)
def no_match(py_obj, h_group, call_id=0, **kwargs):
""" If no match is made, raise an exception
Args:
py_obj: python object to dump; default if item is not matched.
h_group (h5.File.group): group to dump data into.
call_id (int): index to identify object's relative location in the iterable.
"""
pickled_obj = pickle.dumps(py_obj)
d = h_group.create_dataset('data_%i' % call_id, data=[pickled_obj])
d.attrs["type"] = [b'pickle']
warnings.warn("%s type not understood, data have been serialized" % type(py_obj),
SerializedWarning)
#############
## LOADERS ##
#############
class PyContainer(list):
""" A group-like object into which to load datasets.
In order to build up a tree-like structure, we need to be able
to load datasets into a container with an append() method.
Python tuples and sets do not allow this. This class provides
a list-like object that can be converted into a list, tuple, set or dict.
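Example (conceptual sketch; the exact container_type byte strings and their
conversion functions are defined in lookup.py and may differ):

    pc = PyContainer()
    pc.container_type = b"<type 'tuple'>"   # hypothetical key into container_types_dict
    pc.append(1)
    pc.append(2)
    result = pc.convert()   # converted by the matching entry in container_types_dict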
"""
def __init__(self):
super(PyContainer, self).__init__()
self.container_type = None
self.name = None
self.key_type = None
def convert(self):
""" Convert from PyContainer to python core data type.
Returns: self, either as a list, tuple, set or dict
(or other type specified in lookup.py)
"""
if self.container_type in container_types_dict.keys():
convert_fn = container_types_dict[self.container_type]
return convert_fn(self)
if self.container_type == b"dict":
keys = []
for item in self:
key = item.name.split('/')[-1]
key_type = item.key_type[0]
if key_type in container_key_types_dict.keys():
to_type_fn = container_key_types_dict[key_type]
key = to_type_fn(key)
keys.append(key)
items = [item[0] for item in self]
return dict(zip(keys, items))
else:
return self
def no_match_load(key):
""" If no match is made when loading, need to raise an exception
"""
raise RuntimeError("Cannot load %s data type" % key)
#pass
def load_dataset_lookup(key):
""" What type of object are we trying to unpickle? This is a python
dictionary based equivalent of a case statement. It returns the type
a given 'type' keyword in the hickle file.
Args:
py_obj: python object to look-up what function to use to dump to disk
Returns:
match: function that should be used to dump data to a new dataset
"""
match = hkl_types_dict.get(key, no_match_load)
return match
def load(fileobj, path='/', safe=True):
""" Load a hickle file and reconstruct a python object
Args:
fileobj: file object, h5py.File, or filename string
safe (bool): Disable automatic depickling of arbitrary python objects.
DO NOT set this to False unless the file is from a trusted source.
(see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation)
path (str): path within hdf5 file to load data from. Defaults to root /
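Example (illustrative; assumes 'test.hkl' was previously written by dump()):

    data = load('test.hkl')
    sub = load('test.hkl', path='/my_group')   # load from a non-root path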
"""
try:
with file_opener(fileobj) as h5f:
h_root_group = h5f.get(path)
try:
assert 'CLASS' in h5f.attrs.keys()
assert 'VERSION' in h5f.attrs.keys()
VER = h5f.attrs['VERSION']
try:
VER_MAJOR = int(VER)
except ValueError:
VER_MAJOR = int(VER[0])
if VER_MAJOR == 1:
if six.PY2:
warnings.warn("Hickle file versioned as V1, attempting legacy loading...")
from . import hickle_legacy
return hickle_legacy.load(fileobj, safe)
else:
raise RuntimeError("Cannot open file. This file was likely"
" created with Python 2 and an old hickle version.")
elif VER_MAJOR == 2:
if six.PY2:
warnings.warn("Hickle file appears to be old version (v2), attempting "
"legacy loading...")
from . import hickle_legacy2
return hickle_legacy2.load(fileobj, safe=safe)
else:
raise RuntimeError("Cannot open file. This file was likely"
" created with Python 2 and an old hickle version.")
# There is an unfortunate period of time where hickle 2.1.0 claims VERSION = int(3)
# For backward compatibility we really need to catch this.
# Actual hickle v3 files are versioned as A.B.C (e.g. 3.1.0)
elif VER_MAJOR == 3 and VER == VER_MAJOR:
if six.PY2:
warnings.warn("Hickle file appears to be old version (v2.1.0), attempting "
"legacy loading...")
from . import hickle_legacy2
return hickle_legacy2.load(fileobj, safe=safe)
else:
raise RuntimeError("Cannot open file. This file was likely"
" created with Python 2 and an old hickle version.")
elif VER_MAJOR >= 3:
py_container = PyContainer()
py_container.container_type = 'hickle'
py_container = _load(py_container, h_root_group)
return py_container[0][0]
except AssertionError:
if six.PY2:
warnings.warn("Hickle file is not versioned, attempting legacy loading...")
from . import hickle_legacy
return hickle_legacy.load(fileobj, safe)
else:
raise RuntimeError("Cannot open file. This file was likely"
" created with Python 2 and an old hickle version.")
finally:
if 'h5f' in locals():
# Check if file is open, and if so, close it.
if h5f.fid.valid:
h5f.close()
def load_dataset(h_node):
""" Load a dataset, converting into its correct python type
Args:
h_node (h5py dataset): h5py dataset object to read
Returns:
data: reconstructed python object from loaded data
"""
py_type, data = get_type_and_data(h_node)
try:
load_fn = load_dataset_lookup(py_type)
return load_fn(h_node)
except:
raise
#raise RuntimeError("Hickle type %s not understood." % py_type)
def _load(py_container, h_group):
""" Load a hickle file
Recursive funnction to load hdf5 data into a PyContainer()
Args:
py_container (PyContainer): Python container to load data into
h_group (h5 group or dataset): h5py object, group or dataset, to spider
and load all datasets.
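Example (this mirrors how load() above drives the recursion):

    py_container = PyContainer()
    py_container.container_type = 'hickle'
    py_container = _load(py_container, h_root_group)
    data = py_container[0][0]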
"""
group_dtype = h5._hl.group.Group
dataset_dtype = h5._hl.dataset.Dataset
#either a file, group, or dataset
if isinstance(h_group, H5FileWrapper) or isinstance(h_group, group_dtype):
py_subcontainer = PyContainer()
try:
py_subcontainer.container_type = bytes(h_group.attrs['type'][0])
except KeyError:
raise
#py_subcontainer.container_type = ''
py_subcontainer.name = h_group.name
if py_subcontainer.container_type == b'dict_item':
py_subcontainer.key_type = h_group.attrs['key_type']
if py_subcontainer.container_type not in types_not_to_sort:
h_keys = sort_keys(h_group.keys())
else:
h_keys = h_group.keys()
for h_name in h_keys:
h_node = h_group[h_name]
py_subcontainer = _load(py_subcontainer, h_node)
sub_data = py_subcontainer.convert()
py_container.append(sub_data)
else:
# must be a dataset
subdata = load_dataset(h_group)
py_container.append(subdata)
return py_container