laywerrobot/lib/python3.6/site-packages/gensim/models/deprecated/old_saveload.py

400 lines
15 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2018 Radim Rehurek <me@radimrehurek.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Warnings
--------
.. deprecated:: 3.3.0
Use :mod:`gensim.utils` instead.
Class containing the old SaveLoad class with modeified `unpickle` function is support loading models saved using
an older gensim version.
"""
from __future__ import with_statement
import logging
try:
import cPickle as _pickle
except ImportError:
import pickle as _pickle
import re
import sys
import numpy as np
import scipy.sparse
from six import iteritems
from smart_open import smart_open
if sys.version_info[0] >= 3:
unicode = str
logger = logging.getLogger(__name__)
PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)
class SaveLoad(object):
"""Class which inherit from this class have save/load functions, which un/pickle them to disk.
Warnings
--------
This uses pickle for de/serializing, so objects must not contain unpicklable attributes,
such as lambda functions etc.
"""
@classmethod
def load(cls, fname, mmap=None):
"""Load a previously saved object (using :meth:`~gensim.utils.SaveLoad.save`) from file.
Parameters
----------
fname : str
Path to file that contains needed object.
mmap : str, optional
Memory-map option. If the object was saved with large arrays stored separately, you can load these arrays
via mmap (shared memory) using `mmap='r'.
If the file being loaded is compressed (either '.gz' or '.bz2'), then `mmap=None` **must be** set.
See Also
--------
:meth:`~gensim.utils.SaveLoad.save`
Returns
-------
object
Object loaded from `fname`.
Raises
------
IOError
When methods are called on instance (should be called from class).
"""
logger.info("loading %s object from %s", cls.__name__, fname)
compress, subname = SaveLoad._adapt_by_suffix(fname)
obj = unpickle(fname)
obj._load_specials(fname, mmap, compress, subname)
logger.info("loaded %s", fname)
return obj
def _load_specials(self, fname, mmap, compress, subname):
"""Loads any attributes that were stored specially, and gives the same opportunity
to recursively included :class:`~gensim.utils.SaveLoad` instances.
Parameters
----------
fname : str
Path to file that contains needed object.
mmap : str
Memory-map option.
compress : bool
Set to True if file is compressed.
subname : str
...
"""
def mmap_error(obj, filename):
return IOError(
'Cannot mmap compressed object %s in file %s. ' % (obj, filename) +
'Use `load(fname, mmap=None)` or uncompress files manually.'
)
for attrib in getattr(self, '__recursive_saveloads', []):
cfname = '.'.join((fname, attrib))
logger.info("loading %s recursively from %s.* with mmap=%s", attrib, cfname, mmap)
getattr(self, attrib)._load_specials(cfname, mmap, compress, subname)
for attrib in getattr(self, '__numpys', []):
logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)
if compress:
if mmap:
raise mmap_error(attrib, subname(fname, attrib))
val = np.load(subname(fname, attrib))['val']
else:
val = np.load(subname(fname, attrib), mmap_mode=mmap)
setattr(self, attrib, val)
for attrib in getattr(self, '__scipys', []):
logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)
sparse = unpickle(subname(fname, attrib))
if compress:
if mmap:
raise mmap_error(attrib, subname(fname, attrib))
with np.load(subname(fname, attrib, 'sparse')) as f:
sparse.data = f['data']
sparse.indptr = f['indptr']
sparse.indices = f['indices']
else:
sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap)
sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap)
sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap)
setattr(self, attrib, sparse)
for attrib in getattr(self, '__ignoreds', []):
logger.info("setting ignored attribute %s to None", attrib)
setattr(self, attrib, None)
@staticmethod
def _adapt_by_suffix(fname):
"""Give appropriate compress setting and filename formula.
Parameters
----------
fname : str
Input filename.
Returns
-------
(bool, function)
First argument will be True if `fname` compressed.
"""
compress, suffix = (True, 'npz') if fname.endswith('.gz') or fname.endswith('.bz2') else (False, 'npy')
return compress, lambda *args: '.'.join(args + (suffix,))
def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
"""Save the object to file.
Parameters
----------
fname : str
Path to file.
separately : list, optional
Iterable of attributes than need to store distinctly.
sep_limit : int, optional
Limit for separation.
ignore : frozenset, optional
Attributes that shouldn't be store.
pickle_protocol : int, optional
Protocol number for pickle.
Notes
-----
If `separately` is None, automatically detect large
numpy/scipy.sparse arrays in the object being stored, and store
them into separate files. This avoids pickle memory errors and
allows mmap'ing large arrays back on load efficiently.
You can also set `separately` manually, in which case it must be
a list of attribute names to be stored in separate files. The
automatic check is not performed in this case.
See Also
--------
:meth:`~gensim.utils.SaveLoad.load`
"""
logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately)
compress, subname = SaveLoad._adapt_by_suffix(fname)
restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol,
compress, subname)
try:
pickle(self, fname, protocol=pickle_protocol)
finally:
# restore attribs handled specially
for obj, asides in restores:
for attrib, val in iteritems(asides):
setattr(obj, attrib, val)
logger.info("saved %s", fname)
def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
"""Save aside any attributes that need to be handled separately, including
by recursion any attributes that are themselves :class:`~gensim.utils.SaveLoad` instances.
Parameters
----------
fname : str
Output filename.
separately : list or None
Iterable of attributes than need to store distinctly
sep_limit : int
Limit for separation.
ignore : iterable of str
Attributes that shouldn't be store.
pickle_protocol : int
Protocol number for pickle.
compress : bool
If True - compress output with :func:`numpy.savez_compressed`.
subname : function
Produced by :meth:`~gensim.utils.SaveLoad._adapt_by_suffix`
Returns
-------
list of (obj, {attrib: value, ...})
Settings that the caller should use to restore each object's attributes that were set aside
during the default :func:`~gensim.utils.pickle`.
"""
asides = {}
sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)
if separately is None:
separately = []
for attrib, val in iteritems(self.__dict__):
if isinstance(val, np.ndarray) and val.size >= sep_limit:
separately.append(attrib)
elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit:
separately.append(attrib)
# whatever's in `separately` or `ignore` at this point won't get pickled
for attrib in separately + list(ignore):
if hasattr(self, attrib):
asides[attrib] = getattr(self, attrib)
delattr(self, attrib)
recursive_saveloads = []
restores = []
for attrib, val in iteritems(self.__dict__):
if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading
recursive_saveloads.append(attrib)
cfname = '.'.join((fname, attrib))
restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname))
try:
numpys, scipys, ignoreds = [], [], []
for attrib, val in iteritems(asides):
if isinstance(val, np.ndarray) and attrib not in ignore:
numpys.append(attrib)
logger.info("storing np array '%s' to %s", attrib, subname(fname, attrib))
if compress:
np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val))
else:
np.save(subname(fname, attrib), np.ascontiguousarray(val))
elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore:
scipys.append(attrib)
logger.info("storing scipy.sparse array '%s' under %s", attrib, subname(fname, attrib))
if compress:
np.savez_compressed(
subname(fname, attrib, 'sparse'),
data=val.data,
indptr=val.indptr,
indices=val.indices
)
else:
np.save(subname(fname, attrib, 'data'), val.data)
np.save(subname(fname, attrib, 'indptr'), val.indptr)
np.save(subname(fname, attrib, 'indices'), val.indices)
data, indptr, indices = val.data, val.indptr, val.indices
val.data, val.indptr, val.indices = None, None, None
try:
# store array-less object
pickle(val, subname(fname, attrib), protocol=pickle_protocol)
finally:
val.data, val.indptr, val.indices = data, indptr, indices
else:
logger.info("not storing attribute %s", attrib)
ignoreds.append(attrib)
self.__dict__['__numpys'] = numpys
self.__dict__['__scipys'] = scipys
self.__dict__['__ignoreds'] = ignoreds
self.__dict__['__recursive_saveloads'] = recursive_saveloads
except Exception:
# restore the attributes if exception-interrupted
for attrib, val in iteritems(asides):
setattr(self, attrib, val)
raise
return restores + [(self, asides)]
def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
"""Save the object to file.
Parameters
----------
fname_or_handle : str or file-like
Path to output file or already opened file-like object. If the object is a file handle,
no special array handling will be performed, all attributes will be saved to the same file.
separately : list of str or None, optional
If None - automatically detect large numpy/scipy.sparse arrays in the object being stored, and store
them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays
back on load efficiently.
If list of str - this attributes will be stored in separate files, the automatic check
is not performed in this case.
sep_limit : int
Limit for automatic separation.
ignore : frozenset of str
Attributes that shouldn't be serialize/store.
pickle_protocol : int
Protocol number for pickle.
See Also
--------
:meth:`~gensim.utils.SaveLoad.load`
"""
try:
_pickle.dump(self, fname_or_handle, protocol=pickle_protocol)
logger.info("saved %s object", self.__class__.__name__)
except TypeError: # `fname_or_handle` does not have write attribute
self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol)
def unpickle(fname):
"""Load object from `fname`.
Parameters
----------
fname : str
Path to pickle file.
Returns
-------
object
Python object loaded from `fname`.
"""
with smart_open(fname, 'rb') as f:
# Because of loading from S3 load can't be used (missing readline in smart_open)
file_bytes = f.read()
file_bytes = file_bytes.replace(b'gensim.models.word2vec', b'gensim.models.deprecated.word2vec')
file_bytes = file_bytes.replace(b'gensim.models.keyedvectors', b'gensim.models.deprecated.keyedvectors')
file_bytes = file_bytes.replace(b'gensim.models.doc2vec', b'gensim.models.deprecated.doc2vec')
file_bytes = file_bytes.replace(b'gensim.models.fasttext', b'gensim.models.deprecated.fasttext')
file_bytes = file_bytes.replace(
b'gensim.models.wrappers.fasttext', b'gensim.models.deprecated.fasttext_wrapper')
if sys.version_info > (3, 0):
return _pickle.loads(file_bytes, encoding='latin1')
else:
return _pickle.loads(file_bytes)
def pickle(obj, fname, protocol=2):
"""Pickle object `obj` to file `fname`.
Parameters
----------
obj : object
Any python object.
fname : str
Path to pickle file.
protocol : int, optional
Pickle protocol number, default is 2 to support compatible across python 2.x and 3.x.
"""
with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows
_pickle.dump(obj, fout, protocol=protocol)