# encoding: utf-8
"""
# hickle.py

Created by Danny Price 2016-02-03.

Hickle is a HDF5 based clone of Pickle. Instead of serializing to a pickle
file, Hickle dumps to a HDF5 file. It is designed to be as similar to pickle in
usage as possible, providing a load() and dump() function.

## Notes

Hickle has two main advantages over Pickle:
1) LARGE PICKLE HANDLING. Unpickling a large pickle is slow, as the Unpickler
reads the entire pickle thing and loads it into memory. In comparison, HDF5
files are designed for large datasets. Things are only loaded when accessed.

2) CROSS PLATFORM SUPPORT. Attempting to unpickle a pickle pickled on Windows
on Linux and vice versa is likely to fail with errors like "Insecure string
pickle". HDF5 files will load fine, as long as both machines have
h5py installed.

"""

from __future__ import absolute_import
import sys
import os
from pkg_resources import get_distribution

import numpy as np
import h5py as h5


from .helpers import get_type_and_data, sort_keys, check_is_iterable, check_iterable_item_type
from .lookup import types_dict, hkl_types_dict, types_not_to_sort, \
    container_types_dict, container_key_types_dict
from .lookup import check_is_ndarray_like


try:
    from exceptions import Exception
    from types import NoneType
except ImportError:
    pass        # above imports will fail in python3

import six
import io

# Import a default 'pickler'
# Not the nicest import code, but should work on Py2/Py3
try:
    import dill as pickle
except ImportError:
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    except ModuleNotFoundError:
        import pickle
except ModuleNotFoundError:
    import pickle

import warnings

from pkg_resources import get_distribution, DistributionNotFound
try:
    __version__ = get_distribution('hickle').version
except DistributionNotFound:
    __version__ = '0.0.0 - please install via pip/setup.py'

##################
# Error handling #
##################

class FileError(Exception):
    """ An exception raised if the file is fishy """
    def __init__(self):
        return

    def __str__(self):
        return ("Cannot open file. Please pass either a filename "
                "string, a file object, or a h5py.File")


class ClosedFileError(Exception):
    """ An exception raised if the file is fishy """
    def __init__(self):
        return

    def __str__(self):
        return ("HDF5 file has been closed. Please pass either "
                "a filename string, a file object, or an open h5py.File")


class NoMatchError(Exception):
    """ An exception raised if the object type is not understood (or
    supported)"""
    def __init__(self):
        return

    def __str__(self):
        return ("Error: this type of python object cannot be converted into a "
                "hickle.")


class ToDoError(Exception):
    """ An exception raised for non-implemented functionality"""
    def __init__(self):
        return

    def __str__(self):
        return "Error: this functionality hasn't been implemented yet."


class SerializedWarning(UserWarning):
    """ An object type was not understood

    The data will be serialized using pickle.
    """
    pass


######################
# H5PY file wrappers #
######################

class H5GroupWrapper(h5.Group):
    """ Group wrapper that provides a track_times kwarg.

    track_times is a boolean flag that can be set to False, so that two
    files created at different times will have identical MD5 hashes.
    """
    def create_dataset(self, *args, **kwargs):
        kwargs['track_times'] = getattr(self, 'track_times', True)
        return super(H5GroupWrapper, self).create_dataset(*args, **kwargs)

    def create_group(self, *args, **kwargs):
        group = super(H5GroupWrapper, self).create_group(*args, **kwargs)
        group.__class__ = H5GroupWrapper
        group.track_times = getattr(self, 'track_times', True)
        return group


class H5FileWrapper(h5.File):
    """ Wrapper for h5py File that provides a track_times kwarg.

    track_times is a boolean flag that can be set to False, so that two
    files created at different times will have identical MD5 hashes.
    """
    def create_dataset(self, *args, **kwargs):
        kwargs['track_times'] = getattr(self, 'track_times', True)
        return super(H5FileWrapper, self).create_dataset(*args, **kwargs)

    def create_group(self, *args, **kwargs):
        group = super(H5FileWrapper, self).create_group(*args, **kwargs)
        group.__class__ = H5GroupWrapper
        group.track_times = getattr(self, 'track_times', True)
        return group


def file_opener(f, mode='r', track_times=True):
    """ A file opener helper function with some error handling.  This can open
    files through a file object, a h5py file, or just the filename.

    Args:
        f (file, h5py.File, or string): File-identifier, e.g. filename or file object.
        mode (str): File open mode. Only required if opening by filename string.
        track_times (bool): Track time in HDF5; turn off if you want hickling at
                 different times to produce identical files (e.g. for MD5 hash check).

    """

    # Were we handed a file object or just a file name string?
    if six.PY2:
        if isinstance(f, file):
            filename, mode = f.name, f.mode
            f.close()
            h5f = h5.File(filename, mode)
        elif isinstance(f, str) or isinstance(f, unicode):
            filename = f
            h5f = h5.File(filename, mode)
        elif isinstance(f, H5FileWrapper) or isinstance(f, h5._hl.files.File):
            try:
                filename = f.filename
            except ValueError:
                raise ClosedFileError()
            h5f = f
        else:
            print(type(f))
            raise FileError

    else:
        if isinstance(f, io.TextIOWrapper):
            filename, mode = f.name, f.mode
            f.close()
            h5f = h5.File(filename, mode)
        elif isinstance(f, str) or isinstance(f, bytes):
            filename = f
            h5f = h5.File(filename, mode)
        elif isinstance(f, H5FileWrapper) or isinstance(f, h5._hl.files.File):
            try:
                filename = f.filename
            except ValueError:
                raise ClosedFileError()
            h5f = f
        else:
            print(type(f))
            raise FileError


    h5f.__class__ = H5FileWrapper
    h5f.track_times = track_times
    return h5f


###########
# DUMPERS #
###########


def _dump(py_obj, h_group, call_id=0, **kwargs):
    """ Dump a python object to a group within a HDF5 file.

    This function is called recursively by the main dump() function.

    Args:
        py_obj: python object to dump.
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """

    if six.PY2:
        dumpable_dtypes = (bool, int, float, long, complex, str, unicode)
    else:
        dumpable_dtypes = (bool, int, float, complex, bytes, str)

    # Firstly, check if item is a numpy array. If so, just dump it.
    if check_is_ndarray_like(py_obj):
        create_hkl_dataset(py_obj, h_group, call_id, **kwargs)

    # next, check if item is iterable
    elif check_is_iterable(py_obj):
        item_type = check_iterable_item_type(py_obj)

        # item_type == False implies multiple types. Create a dataset
        if item_type is False:
            h_subgroup = create_hkl_group(py_obj, h_group, call_id)
            for ii, py_subobj in enumerate(py_obj):
                _dump(py_subobj, h_subgroup, call_id=ii, **kwargs)

        # otherwise, subitems have same type. Check if subtype is an iterable
        # (e.g. list of lists), or not (e.g. list of ints, which should be treated
        # as a single dataset).
        else:
            if item_type in dumpable_dtypes:
                create_hkl_dataset(py_obj, h_group, call_id, **kwargs)
            else:
                h_subgroup = create_hkl_group(py_obj, h_group, call_id)
                for ii, py_subobj in enumerate(py_obj):
                    _dump(py_subobj, h_subgroup, call_id=ii, **kwargs)

    # item is not iterable, so create a dataset for it
    else:
        create_hkl_dataset(py_obj, h_group, call_id, **kwargs)


def dump(py_obj, file_obj, mode='w', track_times=True, path='/', **kwargs):
    """ Write a pickled representation of obj to the open file object file.

    Args:
    obj (object): python object o store in a Hickle
    file: file object, filename string, or h5py.File object
            file in which to store the object. A h5py.File or a filename is also
            acceptable.
    mode (str): optional argument, 'r' (read only), 'w' (write) or 'a' (append).
            Ignored if file is a file object.
    compression (str): optional argument. Applies compression to dataset. Options: None, gzip,
            lzf (+ szip, if installed)
    track_times (bool): optional argument. If set to False, repeated hickling will produce
            identical files.
    path (str): path within hdf5 file to save data to. Defaults to root /
    """

    try:
        # Open the file
        h5f = file_opener(file_obj, mode, track_times)
        h5f.attrs[b"CLASS"] = b'hickle'
        h5f.attrs[b"VERSION"] = get_distribution('hickle').version
        h5f.attrs[b"type"] = [b'hickle']
        # Log which version of python was used to generate the hickle file
        pv = sys.version_info
        py_ver = "%i.%i.%i" % (pv[0], pv[1], pv[2])
        h5f.attrs[b"PYTHON_VERSION"] = py_ver

        h_root_group = h5f.get(path)

        if h_root_group is None:
            h_root_group = h5f.create_group(path)
            h_root_group.attrs[b"type"] = [b'hickle']

        _dump(py_obj, h_root_group, **kwargs)
        h5f.close()
    except NoMatchError:
        fname = h5f.filename
        h5f.close()
        try:
            os.remove(fname)
        except OSError:
            warnings.warn("Dump failed. Could not remove %s" % fname)
        finally:
            raise NoMatchError


def create_dataset_lookup(py_obj):
    """ What type of object are we trying to pickle?  This is a python
    dictionary based equivalent of a case statement.  It returns the correct
    helper function for a given data type.

    Args:
        py_obj: python object to look-up what function to use to dump to disk

    Returns:
        match: function that should be used to dump data to a new dataset
    """
    t = type(py_obj)
    types_lookup = {dict: create_dict_dataset}
    types_lookup.update(types_dict)

    match = types_lookup.get(t, no_match)

    return match


def create_hkl_dataset(py_obj, h_group, call_id=0, **kwargs):
    """ Create a dataset within the hickle HDF5 file

    Args:
        py_obj: python object to dump.
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.

    """
    #lookup dataset creator type based on python object type
    create_dataset = create_dataset_lookup(py_obj)

    # do the creation
    create_dataset(py_obj, h_group, call_id, **kwargs)


def create_hkl_group(py_obj, h_group, call_id=0):
    """ Create a new group within the hickle file

    Args:
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.

    """
    h_subgroup = h_group.create_group('data_%i' % call_id)
    if six.PY2:
        h_subgroup.attrs["type"] = [str(type(py_obj))]
    else:
        h_subgroup.attrs["type"] = [bytes(str(type(py_obj)), 'ascii')]
    return h_subgroup


def create_dict_dataset(py_obj, h_group, call_id=0, **kwargs):
    """ Creates a data group for each key in dictionary

    Notes:
        This is a very important function which uses the recursive _dump
        method to build up hierarchical data models stored in the HDF5 file.
        As this is critical to functioning, it is kept in the main hickle.py
        file instead of in the loaders/ directory.

    Args:
        py_obj: python object to dump; should be dictionary
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    h_dictgroup = h_group.create_group('data_%i' % call_id)
    h_dictgroup.attrs["type"] = [b'dict']

    for key, py_subobj in py_obj.items():
        if six.PY2:
            if type(key) in (unicode, str):
                h_subgroup = h_dictgroup.create_group(key)
            else:
                h_subgroup = h_dictgroup.create_group(str(key))
        else:
            h_subgroup = h_dictgroup.create_group(str(key))
        h_subgroup.attrs["type"] = [b'dict_item']

        if six.PY2:
            h_subgroup.attrs["key_type"] = [str(type(key))]
        else:
            tk = str(type(key)).encode('utf-8')
            h_subgroup.attrs["key_type"] = [tk]

        _dump(py_subobj, h_subgroup, call_id=0, **kwargs)


def no_match(py_obj, h_group, call_id=0, **kwargs):
    """ If no match is made, raise an exception

    Args:
        py_obj: python object to dump; default if item is not matched.
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    pickled_obj = pickle.dumps(py_obj)
    d = h_group.create_dataset('data_%i' % call_id, data=[pickled_obj])
    d.attrs["type"] = [b'pickle']

    warnings.warn("%s type not understood, data have been serialized" % type(py_obj),
                  SerializedWarning)


#############
## LOADERS ##
#############

class PyContainer(list):
    """ A group-like object into which to load datasets.

    In order to build up a tree-like structure, we need to be able
    to load datasets into a container with an append() method.
    Python tuples and sets do not allow this. This class provides
    a list-like object that be converted into a list, tuple, set or dict.
    """
    def __init__(self):
        super(PyContainer, self).__init__()
        self.container_type = None
        self.name = None
        self.key_type = None

    def convert(self):
        """ Convert from PyContainer to python core data type.

        Returns: self, either as a list, tuple, set or dict
                 (or other type specified in lookup.py)
        """

        if self.container_type in container_types_dict.keys():
            convert_fn = container_types_dict[self.container_type]
            return convert_fn(self)
        if self.container_type == b"dict":
            keys = []
            for item in self:
                key = item.name.split('/')[-1]
                key_type = item.key_type[0]
                if key_type in container_key_types_dict.keys():
                    to_type_fn = container_key_types_dict[key_type]
                    key = to_type_fn(key)
                keys.append(key)

            items = [item[0] for item in self]
            return dict(zip(keys, items))
        else:
            return self

def no_match_load(key):
    """ If no match is made when loading, need to raise an exception
    """
    raise RuntimeError("Cannot load %s data type" % key)
    #pass

def load_dataset_lookup(key):
    """ What type of object are we trying to unpickle?  This is a python
    dictionary based equivalent of a case statement.  It returns the type
    a given 'type' keyword in the hickle file.

    Args:
        py_obj: python object to look-up what function to use to dump to disk

    Returns:
        match: function that should be used to dump data to a new dataset
    """

    match = hkl_types_dict.get(key, no_match_load)

    return match

def load(fileobj, path='/', safe=True):
    """ Load a hickle file and reconstruct a python object

    Args:
        fileobj: file object, h5py.File, or filename string
            safe (bool): Disable automatic depickling of arbitrary python objects.
            DO NOT set this to False unless the file is from a trusted source.
            (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation)

        path (str): path within hdf5 file to save data to. Defaults to root /
    """

    try:
        with file_opener(fileobj) as h5f:
            h_root_group = h5f.get(path)
            try:
                assert 'CLASS' in h5f.attrs.keys()
                assert 'VERSION' in h5f.attrs.keys()
                VER = h5f.attrs['VERSION']
                try:
                    VER_MAJOR = int(VER)
                except ValueError:
                    VER_MAJOR = int(VER[0])
                if VER_MAJOR == 1:
                    if six.PY2:
                        warnings.warn("Hickle file versioned as V1, attempting legacy loading...")
                        from . import hickle_legacy
                        return hickle_legacy.load(fileobj, safe)
                    else:
                        raise RuntimeError("Cannot open file. This file was likely"
                                           " created with Python 2 and an old hickle version.")
                elif VER_MAJOR == 2:
                    if six.PY2:
                        warnings.warn("Hickle file appears to be old version (v2), attempting "
                                      "legacy loading...")
                        from . import hickle_legacy2
                        return hickle_legacy2.load(fileobj, safe=safe)
                    else:
                        raise RuntimeError("Cannot open file. This file was likely"
                                           " created with Python 2 and an old hickle version.")
                # There is an unfortunate period of time where hickle 2.1.0 claims VERSION = int(3)
                # For backward compatibility we really need to catch this.
                # Actual hickle v3 files are versioned as A.B.C (e.g. 3.1.0)
                elif VER_MAJOR == 3 and VER == VER_MAJOR:
                    if six.PY2:
                        warnings.warn("Hickle file appears to be old version (v2.1.0), attempting "
                                      "legacy loading...")
                        from . import hickle_legacy2
                        return hickle_legacy2.load(fileobj, safe=safe)
                    else:
                        raise RuntimeError("Cannot open file. This file was likely"
                                           " created with Python 2 and an old hickle version.")
                elif VER_MAJOR >= 3:
                    py_container = PyContainer()
                    py_container.container_type = 'hickle'
                    py_container = _load(py_container, h_root_group)
                    return py_container[0][0]

            except AssertionError:
                if six.PY2:
                    warnings.warn("Hickle file is not versioned, attempting legacy loading...")
                    from . import hickle_legacy
                    return hickle_legacy.load(fileobj, safe)
                else:
                    raise RuntimeError("Cannot open file. This file was likely"
                                       " created with Python 2 and an old hickle version.")
    finally:
        if 'h5f' in locals():
            # Check if file is open, and if so, close it.
            if h5f.fid.valid:
                h5f.close()

def load_dataset(h_node):
    """ Load a dataset, converting into its correct python type

    Args:
        h_node (h5py dataset): h5py dataset object to read

    Returns:
        data: reconstructed python object from loaded data
    """
    py_type, data = get_type_and_data(h_node)

    try:
        load_fn = load_dataset_lookup(py_type)
        return load_fn(h_node)
    except:
        raise
        #raise RuntimeError("Hickle type %s not understood." % py_type)

def _load(py_container, h_group):
    """ Load a hickle file

    Recursive funnction to load hdf5 data into a PyContainer()

    Args:
        py_container (PyContainer): Python container to load data into
        h_group (h5 group or dataset): h5py object, group or dataset, to spider
                and load all datasets.
    """

    group_dtype   = h5._hl.group.Group
    dataset_dtype = h5._hl.dataset.Dataset

    #either a file, group, or dataset
    if isinstance(h_group, H5FileWrapper) or isinstance(h_group, group_dtype):

        py_subcontainer = PyContainer()
        try:
            py_subcontainer.container_type = bytes(h_group.attrs['type'][0])
        except KeyError:
            raise
            #py_subcontainer.container_type = ''
        py_subcontainer.name = h_group.name

        if py_subcontainer.container_type == b'dict_item':
            py_subcontainer.key_type = h_group.attrs['key_type']

        if py_subcontainer.container_type not in types_not_to_sort:
            h_keys = sort_keys(h_group.keys())
        else:
            h_keys = h_group.keys()

        for h_name in h_keys:
            h_node = h_group[h_name]
            py_subcontainer = _load(py_subcontainer, h_node)

        sub_data = py_subcontainer.convert()
        py_container.append(sub_data)

    else:
        # must be a dataset
        subdata = load_dataset(h_group)
        py_container.append(subdata)

    return py_container