# encoding: utf-8
"""
# hickle.py

Created by Danny Price 2016-02-03.

Hickle is an HDF5-based clone of pickle. Instead of serializing to a pickle
file, Hickle dumps to an HDF5 file. It is designed to be as similar to pickle
in usage as possible, providing a load() and dump() function.

## Notes

Hickle has two main advantages over pickle:
1) LARGE PICKLE HANDLING. Unpickling a large pickle is slow, as the Unpickler
reads the entire pickle file and loads it into memory. In comparison, HDF5
files are designed for large datasets: things are only loaded when accessed.
2) CROSS-PLATFORM SUPPORT. Attempting to unpickle a pickle created on Windows
on Linux (and vice versa) is likely to fail with errors like "Insecure string
pickle". HDF5 files will load fine, as long as both machines have
h5py installed.
"""
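
# Typical round-trip usage (illustrative sketch; assumes hickle is installed
# and that 'data.hkl' is writable in the current directory):
#
#   import numpy as np
#   import hickle as hkl
#
#   data = {'name': 'test', 'data_arr': np.arange(100)}
#   hkl.dump(data, 'data.hkl', mode='w')      # write the dict to HDF5
#   restored = hkl.load('data.hkl')           # read it back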

from __future__ import absolute_import
import sys
import os

import numpy as np
import h5py as h5

from .helpers import get_type_and_data, sort_keys, check_is_iterable, \
    check_iterable_item_type
from .lookup import types_dict, hkl_types_dict, types_not_to_sort, \
    container_types_dict, container_key_types_dict
from .lookup import check_is_ndarray_like

try:
    from exceptions import Exception
    from types import NoneType
except ImportError:
    pass  # above imports will fail in python3

import six
import io

# Import a default 'pickler'
# Not the nicest import code, but should work on Py2/Py3
try:
    import dill as pickle
except ImportError:
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    except ModuleNotFoundError:
        import pickle
except ModuleNotFoundError:
    import pickle

import warnings

from pkg_resources import get_distribution, DistributionNotFound

try:
    __version__ = get_distribution('hickle').version
except DistributionNotFound:
    __version__ = '0.0.0 - please install via pip/setup.py'


##################
# Error handling #
##################

class FileError(Exception):
    """ An exception raised if the file is fishy """
    def __init__(self):
        return

    def __str__(self):
        return ("Cannot open file. Please pass either a filename "
                "string, a file object, or a h5py.File")


class ClosedFileError(Exception):
    """ An exception raised if the HDF5 file has already been closed """
    def __init__(self):
        return

    def __str__(self):
        return ("HDF5 file has been closed. Please pass either "
                "a filename string, a file object, or an open h5py.File")


class NoMatchError(Exception):
    """ An exception raised if the object type is not understood (or
    supported)"""
    def __init__(self):
        return

    def __str__(self):
        return ("Error: this type of python object cannot be converted into a "
                "hickle.")


class ToDoError(Exception):
    """ An exception raised for non-implemented functionality"""
    def __init__(self):
        return

    def __str__(self):
        return "Error: this functionality hasn't been implemented yet."


class SerializedWarning(UserWarning):
    """ An object type was not understood

    The data will be serialized using pickle.
    """
    pass


######################
# H5PY file wrappers #
######################

class H5GroupWrapper(h5.Group):
    """ Group wrapper that provides a track_times kwarg.

    track_times is a boolean flag that can be set to False, so that two
    files created at different times will have identical MD5 hashes.
    """
    def create_dataset(self, *args, **kwargs):
        kwargs['track_times'] = getattr(self, 'track_times', True)
        return super(H5GroupWrapper, self).create_dataset(*args, **kwargs)

    def create_group(self, *args, **kwargs):
        group = super(H5GroupWrapper, self).create_group(*args, **kwargs)
        group.__class__ = H5GroupWrapper
        group.track_times = getattr(self, 'track_times', True)
        return group


class H5FileWrapper(h5.File):
    """ Wrapper for h5py File that provides a track_times kwarg.

    track_times is a boolean flag that can be set to False, so that two
    files created at different times will have identical MD5 hashes.
    """
    def create_dataset(self, *args, **kwargs):
        kwargs['track_times'] = getattr(self, 'track_times', True)
        return super(H5FileWrapper, self).create_dataset(*args, **kwargs)

    def create_group(self, *args, **kwargs):
        group = super(H5FileWrapper, self).create_group(*args, **kwargs)
        group.__class__ = H5GroupWrapper
        group.track_times = getattr(self, 'track_times', True)
        return group
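
# Illustrative sketch (assumed usage, not part of the original module): these
# wrappers are what make track_times=False propagate to every group and
# dataset, so repeated dumps of the same object are byte-identical:
#
#   dump(data, 'a.hkl', track_times=False)
#   dump(data, 'b.hkl', track_times=False)
#   # md5sum of a.hkl and b.hkl now match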


def file_opener(f, mode='r', track_times=True):
    """ A file opener helper function with some error handling. This can open
    files through a file object, a h5py file, or just the filename.

    Args:
        f (file, h5py.File, or string): File-identifier, e.g. filename or file object.
        mode (str): File open mode. Only required if opening by filename string.
        track_times (bool): Track time in HDF5; turn off if you want hickling at
            different times to produce identical files (e.g. for MD5 hash check).
    """
    # Were we handed a file object or just a file name string?
    if six.PY2:
        if isinstance(f, file):
            filename, mode = f.name, f.mode
            f.close()
            h5f = h5.File(filename, mode)
        elif isinstance(f, str) or isinstance(f, unicode):
            filename = f
            h5f = h5.File(filename, mode)
        elif isinstance(f, H5FileWrapper) or isinstance(f, h5._hl.files.File):
            try:
                filename = f.filename
            except ValueError:
                raise ClosedFileError()
            h5f = f
        else:
            print(type(f))
            raise FileError
    else:
        if isinstance(f, io.TextIOWrapper):
            filename, mode = f.name, f.mode
            f.close()
            h5f = h5.File(filename, mode)
        elif isinstance(f, str) or isinstance(f, bytes):
            filename = f
            h5f = h5.File(filename, mode)
        elif isinstance(f, H5FileWrapper) or isinstance(f, h5._hl.files.File):
            try:
                filename = f.filename
            except ValueError:
                raise ClosedFileError()
            h5f = f
        else:
            print(type(f))
            raise FileError

    h5f.__class__ = H5FileWrapper
    h5f.track_times = track_times
    return h5f
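
# Illustrative sketch (assumed usage, not part of the original module):
# file_opener accepts a filename string, an already-open h5py.File, or a
# plain (text-mode) file object, and always hands back an H5FileWrapper:
#
#   h5f = file_opener('data.hkl', mode='w')        # by filename
#   h5f = file_opener(h5.File('data.hkl', 'r'))    # existing h5py.File
#   f = open('data.hkl')
#   h5f = file_opener(f)                           # f is closed, then the
#                                                  # path is reopened via h5py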


###########
# DUMPERS #
###########

def _dump(py_obj, h_group, call_id=0, **kwargs):
    """ Dump a python object to a group within an HDF5 file.

    This function is called recursively by the main dump() function.

    Args:
        py_obj: python object to dump.
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    if six.PY2:
        dumpable_dtypes = (bool, int, float, long, complex, str, unicode)
    else:
        dumpable_dtypes = (bool, int, float, complex, bytes, str)

    # Firstly, check if item is a numpy array. If so, just dump it.
    if check_is_ndarray_like(py_obj):
        create_hkl_dataset(py_obj, h_group, call_id, **kwargs)

    # next, check if item is iterable
    elif check_is_iterable(py_obj):
        item_type = check_iterable_item_type(py_obj)

        # item_type == False implies multiple types. Create a group and
        # recurse over the subitems.
        if item_type is False:
            h_subgroup = create_hkl_group(py_obj, h_group, call_id)
            for ii, py_subobj in enumerate(py_obj):
                _dump(py_subobj, h_subgroup, call_id=ii, **kwargs)

        # otherwise, subitems have the same type. Check if the subtype is an
        # iterable (e.g. list of lists) or not (e.g. list of ints, which
        # should be treated as a single dataset).
        else:
            if item_type in dumpable_dtypes:
                create_hkl_dataset(py_obj, h_group, call_id, **kwargs)
            else:
                h_subgroup = create_hkl_group(py_obj, h_group, call_id)
                for ii, py_subobj in enumerate(py_obj):
                    _dump(py_subobj, h_subgroup, call_id=ii, **kwargs)

    # item is not iterable, so create a dataset for it
    else:
        create_hkl_dataset(py_obj, h_group, call_id, **kwargs)
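
# Illustrative sketch (not part of the original module): how _dump maps a few
# common objects onto the layout above, assuming `h_root_group` is the root
# group of a freshly opened hickle file:
#
#   _dump(np.arange(5), h_root_group, 0)      # ndarray -> dataset 'data_0'
#   _dump([1, 2, 3], h_root_group, 1)         # homogeneous ints -> dataset 'data_1'
#   _dump([1, 'a', 2.0], h_root_group, 2)     # mixed types -> group 'data_2'
#                                             # holding datasets data_0..data_2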


def dump(py_obj, file_obj, mode='w', track_times=True, path='/', **kwargs):
    """ Write a pickled representation of py_obj to the given file.

    Args:
        py_obj (object): python object to store in a Hickle file.
        file_obj: file object, filename string, or h5py.File object in which
            to store the object.
        mode (str): optional argument, 'r' (read only), 'w' (write) or 'a' (append).
            Ignored if file is a file object.
        compression (str): optional argument. Applies compression to dataset.
            Options: None, gzip, lzf (+ szip, if installed)
        track_times (bool): optional argument. If set to False, repeated hickling
            will produce identical files.
        path (str): path within hdf5 file to save data to. Defaults to root /.
    """
    try:
        # Open the file
        h5f = file_opener(file_obj, mode, track_times)
        h5f.attrs[b"CLASS"] = b'hickle'
        h5f.attrs[b"VERSION"] = get_distribution('hickle').version
        h5f.attrs[b"type"] = [b'hickle']
        # Log which version of python was used to generate the hickle file
        pv = sys.version_info
        py_ver = "%i.%i.%i" % (pv[0], pv[1], pv[2])
        h5f.attrs[b"PYTHON_VERSION"] = py_ver

        h_root_group = h5f.get(path)
        if h_root_group is None:
            h_root_group = h5f.create_group(path)
            h_root_group.attrs[b"type"] = [b'hickle']

        _dump(py_obj, h_root_group, **kwargs)
        h5f.close()
    except NoMatchError:
        fname = h5f.filename
        h5f.close()
        try:
            os.remove(fname)
        except OSError:
            warnings.warn("Dump failed. Could not remove %s" % fname)
        finally:
            raise NoMatchError
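
# Illustrative sketch (assumed usage, not part of the original module): extra
# **kwargs are forwarded to h5py's create_dataset, so compression settings can
# be passed straight through, and `path` selects a subgroup inside the file:
#
#   dump(np.random.rand(1000, 1000), 'compressed.hkl',
#        compression='gzip', compression_opts=9)
#   dump({'a': 1}, 'runs.hkl', mode='a', path='/run_01')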


def create_dataset_lookup(py_obj):
    """ What type of object are we trying to pickle? This is a python
    dictionary based equivalent of a case statement. It returns the correct
    helper function for a given data type.

    Args:
        py_obj: python object to look-up what function to use to dump to disk

    Returns:
        match: function that should be used to dump data to a new dataset
    """
    t = type(py_obj)

    types_lookup = {dict: create_dict_dataset}
    types_lookup.update(types_dict)

    match = types_lookup.get(t, no_match)

    return match


def create_hkl_dataset(py_obj, h_group, call_id=0, **kwargs):
    """ Create a dataset within the hickle HDF5 file

    Args:
        py_obj: python object to dump.
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    # look up the dataset creator based on the python object's type
    create_dataset = create_dataset_lookup(py_obj)

    # do the creation
    create_dataset(py_obj, h_group, call_id, **kwargs)


def create_hkl_group(py_obj, h_group, call_id=0):
    """ Create a new group within the hickle file

    Args:
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    h_subgroup = h_group.create_group('data_%i' % call_id)
    if six.PY2:
        h_subgroup.attrs["type"] = [str(type(py_obj))]
    else:
        h_subgroup.attrs["type"] = [bytes(str(type(py_obj)), 'ascii')]
    return h_subgroup


def create_dict_dataset(py_obj, h_group, call_id=0, **kwargs):
    """ Creates a data group for each key in dictionary

    Notes:
        This is a very important function which uses the recursive _dump
        method to build up hierarchical data models stored in the HDF5 file.
        As this is critical to functioning, it is kept in the main hickle.py
        file instead of in the loaders/ directory.

    Args:
        py_obj: python object to dump; should be dictionary
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    h_dictgroup = h_group.create_group('data_%i' % call_id)
    h_dictgroup.attrs["type"] = [b'dict']

    for key, py_subobj in py_obj.items():
        if six.PY2:
            if type(key) in (unicode, str):
                h_subgroup = h_dictgroup.create_group(key)
            else:
                h_subgroup = h_dictgroup.create_group(str(key))
        else:
            h_subgroup = h_dictgroup.create_group(str(key))
        h_subgroup.attrs["type"] = [b'dict_item']

        if six.PY2:
            h_subgroup.attrs["key_type"] = [str(type(key))]
        else:
            tk = str(type(key)).encode('utf-8')
            h_subgroup.attrs["key_type"] = [tk]

        _dump(py_subobj, h_subgroup, call_id=0, **kwargs)
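
# Illustrative sketch (assumed layout, not part of the original module): a
# nested dict such as
#
#   dump({'a': 1, 'b': {'c': np.arange(3)}}, 'nested.hkl')
#
# is stored as roughly the following hierarchy, with one 'dict_item' subgroup
# per key and the key's type recorded in its 'key_type' attribute:
#
#   /data_0                       (type: dict)
#   /data_0/a/data_0              (dataset: 1)
#   /data_0/b/data_0              (type: dict)
#   /data_0/b/data_0/c/data_0     (dataset: [0 1 2])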


def no_match(py_obj, h_group, call_id=0, **kwargs):
    """ Fallback dumper: if no match is made, serialize the object with pickle
    and warn the user.

    Args:
        py_obj: python object to dump; default if item is not matched.
        h_group (h5.File.group): group to dump data into.
        call_id (int): index to identify object's relative location in the iterable.
    """
    pickled_obj = pickle.dumps(py_obj)
    d = h_group.create_dataset('data_%i' % call_id, data=[pickled_obj])
    d.attrs["type"] = [b'pickle']

    warnings.warn("%s type not understood, data have been serialized" % type(py_obj),
                  SerializedWarning)
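
# Illustrative sketch (assumed behaviour, not part of the original module):
# objects without a registered dumper fall back to the pickle path above and
# emit a SerializedWarning instead of failing outright:
#
#   class Widget(object):
#       def __init__(self, n):
#           self.n = n
#
#   dump(Widget(3), 'widget.hkl')   # SerializedWarning: type not understood
#   w = load('widget.hkl')          # reconstructed from the pickled bytes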


#############
## LOADERS ##
#############

class PyContainer(list):
    """ A group-like object into which to load datasets.

    In order to build up a tree-like structure, we need to be able
    to load datasets into a container with an append() method.
    Python tuples and sets do not allow this. This class provides
    a list-like object that can be converted into a list, tuple, set or dict.
    """
    def __init__(self):
        super(PyContainer, self).__init__()
        self.container_type = None
        self.name = None
        self.key_type = None

    def convert(self):
        """ Convert from PyContainer to python core data type.

        Returns: self, either as a list, tuple, set or dict
                 (or other type specified in lookup.py)
        """
        if self.container_type in container_types_dict.keys():
            convert_fn = container_types_dict[self.container_type]
            return convert_fn(self)
        if self.container_type == b"dict":
            keys = []
            for item in self:
                key = item.name.split('/')[-1]
                key_type = item.key_type[0]
                if key_type in container_key_types_dict.keys():
                    to_type_fn = container_key_types_dict[key_type]
                    key = to_type_fn(key)
                keys.append(key)

            items = [item[0] for item in self]
            return dict(zip(keys, items))
        else:
            return self


def no_match_load(key):
    """ If no match is made when loading, raise an exception
    """
    raise RuntimeError("Cannot load %s data type" % key)


def load_dataset_lookup(key):
    """ What type of object are we trying to unpickle? This is a python
    dictionary based equivalent of a case statement. It returns the loader
    function for a given 'type' keyword in the hickle file.

    Args:
        key: the 'type' attribute stored alongside the dataset

    Returns:
        match: function that should be used to reconstruct the data
    """
    match = hkl_types_dict.get(key, no_match_load)

    return match


def load(fileobj, path='/', safe=True):
    """ Load a hickle file and reconstruct a python object

    Args:
        fileobj: file object, h5py.File, or filename string
        safe (bool): Disable automatic depickling of arbitrary python objects.
            DO NOT set this to False unless the file is from a trusted source.
            (see http://www.cs.jhu.edu/~s/musings/pickle.html for an explanation)
        path (str): path within hdf5 file to load data from. Defaults to root /.
    """
    try:
        with file_opener(fileobj) as h5f:
            h_root_group = h5f.get(path)

            try:
                assert 'CLASS' in h5f.attrs.keys()
                assert 'VERSION' in h5f.attrs.keys()
                VER = h5f.attrs['VERSION']
                try:
                    VER_MAJOR = int(VER)
                except ValueError:
                    VER_MAJOR = int(VER[0])

                if VER_MAJOR == 1:
                    if six.PY2:
                        warnings.warn("Hickle file versioned as V1, attempting legacy loading...")
                        from . import hickle_legacy
                        return hickle_legacy.load(fileobj, safe)
                    else:
                        raise RuntimeError("Cannot open file. This file was likely"
                                           " created with Python 2 and an old hickle version.")
                elif VER_MAJOR == 2:
                    if six.PY2:
                        warnings.warn("Hickle file appears to be old version (v2), attempting "
                                      "legacy loading...")
                        from . import hickle_legacy2
                        return hickle_legacy2.load(fileobj, safe=safe)
                    else:
                        raise RuntimeError("Cannot open file. This file was likely"
                                           " created with Python 2 and an old hickle version.")
                # There is an unfortunate period of time where hickle 2.1.0 claims VERSION = int(3)
                # For backward compatibility we really need to catch this.
                # Actual hickle v3 files are versioned as A.B.C (e.g. 3.1.0)
                elif VER_MAJOR == 3 and VER == VER_MAJOR:
                    if six.PY2:
                        warnings.warn("Hickle file appears to be old version (v2.1.0), attempting "
                                      "legacy loading...")
                        from . import hickle_legacy2
                        return hickle_legacy2.load(fileobj, safe=safe)
                    else:
                        raise RuntimeError("Cannot open file. This file was likely"
                                           " created with Python 2 and an old hickle version.")
                elif VER_MAJOR >= 3:
                    py_container = PyContainer()
                    py_container.container_type = 'hickle'
                    py_container = _load(py_container, h_root_group)
                    return py_container[0][0]

            except AssertionError:
                if six.PY2:
                    warnings.warn("Hickle file is not versioned, attempting legacy loading...")
                    from . import hickle_legacy
                    return hickle_legacy.load(fileobj, safe)
                else:
                    raise RuntimeError("Cannot open file. This file was likely"
                                       " created with Python 2 and an old hickle version.")
    finally:
        # Check if the file is still open, and if so, close it.
        if 'h5f' in locals():
            if h5f.fid.valid:
                h5f.close()
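
# Illustrative sketch (assumed usage, not part of the original module):
# loading honours the same `path` argument used at dump time, so data written
# to a subgroup can be pulled back out on its own:
#
#   dump({'a': 1}, 'runs.hkl', mode='a', path='/run_01')
#   run = load('runs.hkl', path='/run_01')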


def load_dataset(h_node):
    """ Load a dataset, converting into its correct python type

    Args:
        h_node (h5py dataset): h5py dataset object to read

    Returns:
        data: reconstructed python object from loaded data
    """
    py_type, data = get_type_and_data(h_node)

    try:
        load_fn = load_dataset_lookup(py_type)
        return load_fn(h_node)
    except:
        raise
        #raise RuntimeError("Hickle type %s not understood." % py_type)


def _load(py_container, h_group):
    """ Load a hickle file

    Recursive function to load hdf5 data into a PyContainer()

    Args:
        py_container (PyContainer): Python container to load data into
        h_group (h5 group or dataset): h5py object, group or dataset, to spider
            and load all datasets.
    """
    group_dtype = h5._hl.group.Group
    dataset_dtype = h5._hl.dataset.Dataset

    # either a file, group, or dataset
    if isinstance(h_group, H5FileWrapper) or isinstance(h_group, group_dtype):

        py_subcontainer = PyContainer()
        try:
            py_subcontainer.container_type = bytes(h_group.attrs['type'][0])
        except KeyError:
            raise
            #py_subcontainer.container_type = ''
        py_subcontainer.name = h_group.name

        if py_subcontainer.container_type == b'dict_item':
            py_subcontainer.key_type = h_group.attrs['key_type']

        if py_subcontainer.container_type not in types_not_to_sort:
            h_keys = sort_keys(h_group.keys())
        else:
            h_keys = h_group.keys()

        for h_name in h_keys:
            h_node = h_group[h_name]
            py_subcontainer = _load(py_subcontainer, h_node)

        sub_data = py_subcontainer.convert()
        py_container.append(sub_data)

    else:
        # must be a dataset
        subdata = load_dataset(h_group)
        py_container.append(subdata)

    return py_container
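

# ---------------------------------------------------------------------------
# Minimal smoke test (illustrative sketch, not part of the original module).
# Because of the relative imports above, run it as `python -m hickle.hickle`
# rather than executing this file directly.
if __name__ == "__main__":
    import tempfile

    _tmpdir = tempfile.mkdtemp()
    _fname = os.path.join(_tmpdir, "demo.hkl")

    _original = {"array": np.arange(10), "values": [1, 2, 3]}
    dump(_original, _fname, mode="w")
    _restored = load(_fname)

    # the numpy array should round-trip exactly
    assert np.array_equal(_restored["array"], _original["array"])
    print("hickle round-trip OK: %s" % _restored)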