399 lines
15 KiB
Python
399 lines
15 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright (C) 2018 Radim Rehurek <me@radimrehurek.com>
|
|
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
|
|
"""
|
|
Warnings
|
|
--------
|
|
.. deprecated:: 3.3.0
|
|
Use :mod:`gensim.utils` instead.
|
|
|
|
|
|
Module containing the old SaveLoad class with a modified `unpickle` function, to support loading models saved using
|
|
an older gensim version.
|
|
|
|
"""
|
|
from __future__ import with_statement
|
|
|
|
import logging
|
|
|
|
try:
|
|
import cPickle as _pickle
|
|
except ImportError:
|
|
import pickle as _pickle
|
|
|
|
import re
|
|
import sys
|
|
|
|
import numpy as np
|
|
import scipy.sparse
|
|
|
|
from six import iteritems
|
|
|
|
from smart_open import smart_open
|
|
|
|
# Python 3 has no builtin named `unicode`; bind the name to `str` there so the
# module exposes `unicode` on both major versions.
if sys.version_info >= (3,):
    unicode = str

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# One or more consecutive word characters, none of which is a digit.
PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
# `&name;`, `&#123;` or `&#x1F;` style entity; groups: '#' flag, 'x'/'X' flag, 1-8 word characters.
RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)
|
|
|
|
|
|
class SaveLoad(object):
    """Base class giving subclasses `save`/`load` methods that un/pickle them to disk.

    Large numpy arrays and scipy.sparse CSR/CSC matrices can be stored in separate files next to
    the main pickle, and attributes that are themselves save/load-capable are handled recursively.

    Warnings
    --------
    This uses pickle for de/serializing, so objects must not contain unpicklable attributes,
    such as lambda functions etc.

    """

    @classmethod
    def load(cls, fname, mmap=None):
        """Load a previously saved object (using :meth:`~gensim.utils.SaveLoad.save`) from file.

        Parameters
        ----------
        fname : str
            Path to file that contains needed object.
        mmap : str, optional
            Memory-map option. If the object was saved with large arrays stored separately, you can load these arrays
            via mmap (shared memory) using `mmap='r'`.
            If the file being loaded is compressed (either '.gz' or '.bz2'), then `mmap=None` **must be** set.

        See Also
        --------
        :meth:`~gensim.utils.SaveLoad.save`

        Returns
        -------
        object
            Object loaded from `fname`.

        Raises
        ------
        IOError
            When `mmap` is requested but `fname` is compressed and the object has separately stored arrays.

        """
        logger.info("loading %s object from %s", cls.__name__, fname)

        compress, subname = SaveLoad._adapt_by_suffix(fname)

        obj = unpickle(fname)
        obj._load_specials(fname, mmap, compress, subname)
        logger.info("loaded %s", fname)
        return obj

    def _load_specials(self, fname, mmap, compress, subname):
        """Load any attributes that were stored specially, and give the same opportunity
        to recursively included :class:`~gensim.utils.SaveLoad` instances.

        Parameters
        ----------
        fname : str
            Path to file that contains needed object.
        mmap : str
            Memory-map option, passed to :func:`numpy.load` as `mmap_mode` for uncompressed files.
        compress : bool
            Set to True if file is compressed.
        subname : function
            Filename builder produced by :meth:`~gensim.utils.SaveLoad._adapt_by_suffix`.

        Raises
        ------
        IOError
            When `mmap` is set while `compress` is True and there is a separately stored array to load.

        """
        def mmap_error(obj, filename):
            return IOError(
                'Cannot mmap compressed object %s in file %s. ' % (obj, filename) +
                'Use `load(fname, mmap=None)` or uncompress files manually.'
            )

        # Attributes recorded at save time as having their own specially stored parts.
        for attrib in getattr(self, '__recursive_saveloads', []):
            cfname = '.'.join((fname, attrib))
            logger.info("loading %s recursively from %s.* with mmap=%s", attrib, cfname, mmap)
            getattr(self, attrib)._load_specials(cfname, mmap, compress, subname)

        for attrib in getattr(self, '__numpys', []):
            logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)

            if compress:
                if mmap:
                    raise mmap_error(attrib, subname(fname, attrib))

                # Close the .npz archive once the array has been read out of it.
                with np.load(subname(fname, attrib)) as npz:
                    val = npz['val']
            else:
                val = np.load(subname(fname, attrib), mmap_mode=mmap)

            setattr(self, attrib, val)

        for attrib in getattr(self, '__scipys', []):
            logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)
            # The pickled matrix was stored without its arrays; they are re-attached below.
            sparse = unpickle(subname(fname, attrib))
            if compress:
                if mmap:
                    raise mmap_error(attrib, subname(fname, attrib))

                with np.load(subname(fname, attrib, 'sparse')) as f:
                    sparse.data = f['data']
                    sparse.indptr = f['indptr']
                    sparse.indices = f['indices']
            else:
                sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap)
                sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap)
                sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap)

            setattr(self, attrib, sparse)

        for attrib in getattr(self, '__ignoreds', []):
            logger.info("setting ignored attribute %s to None", attrib)
            setattr(self, attrib, None)

    @staticmethod
    def _adapt_by_suffix(fname):
        """Give appropriate compress setting and filename formula.

        Parameters
        ----------
        fname : str
            Input filename.

        Returns
        -------
        (bool, function)
            First element is True if `fname` ends with '.gz' or '.bz2'. Second element joins its
            string arguments with '.' and appends 'npz' (compressed) or 'npy' (uncompressed).

        """
        compress, suffix = (True, 'npz') if fname.endswith(('.gz', '.bz2')) else (False, 'npy')
        return compress, lambda *args: '.'.join(args + (suffix,))

    def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
        """Save the object to file.

        Parameters
        ----------
        fname : str
            Path to file.
        separately : iterable of str, optional
            Attributes that need to be stored in separate files.
        sep_limit : int, optional
            Size threshold (array `size` / sparse `nnz`) for automatic separation.
        ignore : frozenset, optional
            Attributes that shouldn't be stored.
        pickle_protocol : int, optional
            Protocol number for pickle.

        Notes
        -----
        If `separately` is None, automatically detect large
        numpy/scipy.sparse arrays in the object being stored, and store
        them into separate files. This avoids pickle memory errors and
        allows mmap'ing large arrays back on load efficiently.

        You can also set `separately` manually, in which case it must be
        an iterable of attribute names to be stored in separate files. The
        automatic check is not performed in this case.

        See Also
        --------
        :meth:`~gensim.utils.SaveLoad.load`

        """
        logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately)

        compress, subname = SaveLoad._adapt_by_suffix(fname)

        restores = self._save_specials(
            fname, separately, sep_limit, ignore, pickle_protocol, compress, subname
        )
        try:
            pickle(self, fname, protocol=pickle_protocol)
        finally:
            # restore attribs handled specially
            for obj, asides in restores:
                for attrib, val in asides.items():
                    setattr(obj, attrib, val)
        logger.info("saved %s", fname)

    def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
        """Save aside any attributes that need to be handled separately, including
        by recursion any attributes that are themselves :class:`~gensim.utils.SaveLoad` instances.

        Parameters
        ----------
        fname : str
            Output filename.
        separately : iterable of str or None
            Attributes that need to be stored separately. If None, large arrays are detected automatically.
        sep_limit : int
            Size threshold (array `size` / sparse `nnz`) for automatic separation.
        ignore : iterable of str
            Attributes that shouldn't be stored.
        pickle_protocol : int
            Protocol number for pickle.
        compress : bool
            If True - compress output with :func:`numpy.savez_compressed`.
        subname : function
            Produced by :meth:`~gensim.utils.SaveLoad._adapt_by_suffix`

        Returns
        -------
        list of (obj, {attrib: value, ...})
            Settings that the caller should use to restore each object's attributes that were set aside
            during the default :func:`~gensim.utils.pickle`.

        Notes
        -----
        If anything fails after attributes have started being set aside, every attribute removed so far
        (on this object and on nested objects already processed) is put back before the exception propagates.

        """
        asides = {}
        sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)
        if separately is None:
            separately = []
            for attrib, val in self.__dict__.items():
                if isinstance(val, np.ndarray) and val.size >= sep_limit:
                    separately.append(attrib)
                elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit:
                    separately.append(attrib)

        recursive_saveloads = []
        restores = []
        try:
            # whatever's in `separately` or `ignore` at this point won't get pickled;
            # `list(...)` lets callers pass any iterable (tuple, set, ...), not just a list
            for attrib in list(separately) + list(ignore):
                if hasattr(self, attrib):
                    val = getattr(self, attrib)
                    delattr(self, attrib)
                    # record only after the delete succeeded, so a failed delete is not "restored"
                    asides[attrib] = val

            # iterate over a snapshot so the recursion can never disturb the iteration
            for attrib, val in list(self.__dict__.items()):
                if hasattr(val, '_save_specials'):  # better than 'isinstance(val, SaveLoad)' if IPython reloading
                    recursive_saveloads.append(attrib)
                    cfname = '.'.join((fname, attrib))
                    restores.extend(
                        val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname)
                    )

            numpys, scipys, ignoreds = [], [], []
            for attrib, val in asides.items():
                if isinstance(val, np.ndarray) and attrib not in ignore:
                    numpys.append(attrib)
                    logger.info("storing np array '%s' to %s", attrib, subname(fname, attrib))

                    if compress:
                        np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val))
                    else:
                        np.save(subname(fname, attrib), np.ascontiguousarray(val))

                elif isinstance(val, sparse_matrices) and attrib not in ignore:
                    scipys.append(attrib)
                    logger.info("storing scipy.sparse array '%s' under %s", attrib, subname(fname, attrib))

                    if compress:
                        np.savez_compressed(
                            subname(fname, attrib, 'sparse'),
                            data=val.data,
                            indptr=val.indptr,
                            indices=val.indices
                        )
                    else:
                        np.save(subname(fname, attrib, 'data'), val.data)
                        np.save(subname(fname, attrib, 'indptr'), val.indptr)
                        np.save(subname(fname, attrib, 'indices'), val.indices)

                    # detach the arrays just saved so the pickle below holds only the matrix shell
                    data, indptr, indices = val.data, val.indptr, val.indices
                    val.data, val.indptr, val.indices = None, None, None

                    try:
                        # store array-less object
                        pickle(val, subname(fname, attrib), protocol=pickle_protocol)
                    finally:
                        val.data, val.indptr, val.indices = data, indptr, indices
                else:
                    logger.info("not storing attribute %s", attrib)
                    ignoreds.append(attrib)

            # bookkeeping read back by `_load_specials`
            self.__dict__['__numpys'] = numpys
            self.__dict__['__scipys'] = scipys
            self.__dict__['__ignoreds'] = ignoreds
            self.__dict__['__recursive_saveloads'] = recursive_saveloads
        except Exception:
            # restore the attributes if exception-interrupted: first those of nested objects
            # that were already processed, then this object's own
            for obj, obj_asides in restores:
                for attrib, val in obj_asides.items():
                    setattr(obj, attrib, val)
            for attrib, val in asides.items():
                setattr(self, attrib, val)
            raise
        return restores + [(self, asides)]

    def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
        """Save the object to file.

        Parameters
        ----------
        fname_or_handle : str or file-like
            Path to output file or already opened file-like object. If the object is a file handle,
            no special array handling will be performed, all attributes will be saved to the same file.
        separately : iterable of str or None, optional
            If None - automatically detect large numpy/scipy.sparse arrays in the object being stored, and store
            them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays
            back on load efficiently.
            If iterable of str - these attributes will be stored in separate files, the automatic check
            is not performed in this case.
        sep_limit : int
            Limit for automatic separation.
        ignore : frozenset of str
            Attributes that shouldn't be serialized/stored.
        pickle_protocol : int
            Protocol number for pickle.

        Raises
        ------
        TypeError
            If `fname_or_handle` is a file-like object (has a `write` attribute) and pickling into it
            raises `TypeError`.

        See Also
        --------
        :meth:`~gensim.utils.SaveLoad.load`

        """
        try:
            _pickle.dump(self, fname_or_handle, protocol=pickle_protocol)
            logger.info("saved %s object", self.__class__.__name__)
        except TypeError:  # `fname_or_handle` does not have write attribute
            if hasattr(fname_or_handle, 'write'):
                # A real handle was given, so the TypeError came from pickling/writing itself.
                # Falling back to the path-based save would only mask it with an unrelated error.
                raise
            self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol)
|
|
|
|
|
|
def unpickle(fname):
    """Load a pickled object from `fname`, redirecting old gensim module paths.

    Before unpickling, the raw bytes are rewritten so that references to several
    `gensim.models.*` modules point at their `gensim.models.deprecated.*` counterparts.

    Parameters
    ----------
    fname : str
        Path to pickle file.

    Returns
    -------
    object
        Python object loaded from `fname`.

    Warnings
    --------
    The content is handed to pickle, which can execute arbitrary code while loading;
    only use this on files from a trusted source.

    """
    # (old module path, relocated module path), applied in exactly this order
    relocations = (
        (b'gensim.models.word2vec', b'gensim.models.deprecated.word2vec'),
        (b'gensim.models.keyedvectors', b'gensim.models.deprecated.keyedvectors'),
        (b'gensim.models.doc2vec', b'gensim.models.deprecated.doc2vec'),
        (b'gensim.models.fasttext', b'gensim.models.deprecated.fasttext'),
        (b'gensim.models.wrappers.fasttext', b'gensim.models.deprecated.fasttext_wrapper'),
    )

    with smart_open(fname, 'rb') as fin:
        # Because of loading from S3 load can't be used (missing readline in smart_open)
        payload = fin.read()

    for old_path, new_path in relocations:
        payload = payload.replace(old_path, new_path)

    if sys.version_info > (3, 0):
        return _pickle.loads(payload, encoding='latin1')
    return _pickle.loads(payload)
|
|
|
|
|
|
def pickle(obj, fname, protocol=2):
    """Serialize `obj` with pickle and write it to the file `fname`.

    Parameters
    ----------
    obj : object
        Any python object.
    fname : str
        Path to pickle file.
    protocol : int, optional
        Pickle protocol number, default is 2 for compatibility across python 2.x and 3.x.

    """
    # 'b' for binary, needed on Windows
    with smart_open(fname, 'wb') as sink:
        _pickle.dump(obj, sink, protocol=protocol)
|