2070 lines
63 KiB
Python
2070 lines
63 KiB
Python
#!/usr/bin/env python
|
||
# -*- coding: utf-8 -*-
|
||
#
|
||
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
|
||
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
|
||
|
||
"""Various general utility functions."""
|
||
|
||
from __future__ import with_statement
|
||
from contextlib import contextmanager
|
||
import collections
|
||
import logging
|
||
import warnings
|
||
|
||
try:
|
||
from html.entities import name2codepoint as n2cp
|
||
except ImportError:
|
||
from htmlentitydefs import name2codepoint as n2cp
|
||
try:
|
||
import cPickle as _pickle
|
||
except ImportError:
|
||
import pickle as _pickle
|
||
|
||
import re
|
||
import unicodedata
|
||
import os
|
||
import random
|
||
import itertools
|
||
import tempfile
|
||
from functools import wraps
|
||
import multiprocessing
|
||
import shutil
|
||
import sys
|
||
import subprocess
|
||
import inspect
|
||
import heapq
|
||
|
||
import numpy as np
|
||
import numbers
|
||
import scipy.sparse
|
||
|
||
from six import iterkeys, iteritems, itervalues, u, string_types, unichr
|
||
from six.moves import xrange
|
||
|
||
from smart_open import smart_open
|
||
|
||
from multiprocessing import cpu_count
|
||
|
||
# Python 3 has no separate `unicode` builtin; alias it to `str` so the rest of
# the module can use a single name on both major versions.
if sys.version_info >= (3,):
    unicode = str

# Module-level logger used by the helpers in this file.
logger = logging.getLogger(__name__)

# Matches runs of word characters that contain no digits.
PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
# Matches HTML entities: group 1 is '#' for numeric entities, group 2 is 'x'/'X'
# for hexadecimal ones, group 3 is the entity name or number (1 to 8 characters).
RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)
|
||
|
||
|
||
def get_random_state(seed):
    """Turn `seed` into a :class:`numpy.random.RandomState` instance.

    Parameters
    ----------
    seed : {None, int, :class:`numpy.random.RandomState`}
        `None` (or the `numpy.random` module itself) selects numpy's global random state,
        an integer seeds a brand new state, and an existing state is passed through unchanged.

    Returns
    -------
    :class:`numpy.random.RandomState`
        Random state.

    Raises
    ------
    ValueError
        If `seed` is none of the supported kinds listed above.

    Notes
    -----
    Method originally from `maciejkula/glove-python <https://github.com/maciejkula/glove-python>`_
    and written by `@joshloyal <https://github.com/joshloyal>`_.

    """
    # an already constructed state is returned as-is
    if isinstance(seed, np.random.RandomState):
        return seed
    # integers (python or numpy) seed a fresh, independent state
    if isinstance(seed, (numbers.Integral, np.integer)):
        return np.random.RandomState(seed)
    # no seed requested: hand out the global singleton used by `np.random.*`
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    raise ValueError('%r cannot be used to seed a np.random.RandomState instance' % seed)
|
||
|
||
|
||
def synchronous(tlockname):
    """Build a decorator that runs a method while holding a lock stored on the instance.

    Parameters
    ----------
    tlockname : str
        Name of the instance attribute that holds the lock. The attribute is looked up on `self`
        at every call and must support the context manager protocol.

    Returns
    -------
    function
        Decorator to apply to methods.

    Notes
    -----
    Adapted from http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/.

    """
    def _synched(func):
        @wraps(func)
        def _synchronizer(self, *args, **kwargs):
            lock = getattr(self, tlockname)
            logger.debug("acquiring lock %r for %s", tlockname, func.__name__)

            # the `with` statement guarantees the lock is released even if `func` raises
            with lock:
                logger.debug("acquired lock %r for %s", tlockname, func.__name__)
                outcome = func(self, *args, **kwargs)
                logger.debug("releasing lock %r for %s", tlockname, func.__name__)
            return outcome

        return _synchronizer

    return _synched
|
||
|
||
|
||
def file_or_filename(input):
    """Return a readable file object for `input`, positioned at its beginning.

    Parameters
    ----------
    input : str or file-like
        Filename or file-like object.

    Returns
    -------
    file-like object
        For a filename, the result of opening it with `smart_open`. For anything else, `input` itself
        after `input.seek(0)` has been called on it.

    """
    if not isinstance(input, string_types):
        # already a file-like object: just rewind it
        input.seek(0)
        return input
    # a filename: open it for reading
    return smart_open(input)
|
||
|
||
|
||
@contextmanager
def open_file(input):
    """Provide "with-like" behaviour that only closes files it opened itself.

    Parameters
    ----------
    input : str or file-like
        Filename or file-like object.

    Yields
    ------
    file
        File-like object based on input (or input itself if it is already file-like).

    Notes
    -----
    When `input` is a filename, the file opened for it is released through its `__exit__` method
    once the `with` body finishes or raises. A file-like `input` is left open for the caller.

    """
    handle = file_or_filename(input)
    opened_here = isinstance(input, string_types)
    body_failed = False
    try:
        yield handle
    except Exception:
        # The `with` body raised. Give a handle we opened the chance to clean up (and possibly
        # suppress the error, if its `__exit__` returns a true value); otherwise propagate.
        body_failed = True
        if not opened_here or not handle.__exit__(*sys.exc_info()):
            raise
    finally:
        # Normal completion: release the handle, but only if it was opened by this function.
        if opened_here and not body_failed:
            handle.__exit__(None, None, None)
|
||
|
||
|
||
def deaccent(text):
    """Strip accent marks from the letters of `text`.

    Parameters
    ----------
    text : str
        Input string. Byte strings are decoded as utf8 (strict error handling).

    Returns
    -------
    str
        Unicode string without accents.

    Examples
    --------
    >>> from gensim.utils import deaccent
    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    # NFD splits accented letters into a base letter followed by combining marks (category 'Mn')
    decomposed = unicodedata.normalize("NFD", text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    # recompose whatever is left
    return unicodedata.normalize("NFC", u('').join(kept))
|
||
|
||
|
||
def copytree_hardlink(source, dest):
    """Recursively copy a directory ala shutils.copytree, but hardlink files instead of copying.

    Parameters
    ----------
    source : str
        Path to source directory
    dest : str
        Path to destination directory

    Warnings
    --------
    Available on UNIX systems only.

    Notes
    -----
    On Python 3, :func:`shutil.copytree` receives the copy function through its `copy_function`
    argument, whose default is bound when `shutil` is imported. Temporarily replacing
    `shutil.copy2` therefore has no effect there, so :func:`os.link` is passed explicitly.
    The module-level patching is kept only for Python 2, which has no such argument.

    """
    if sys.version_info[0] >= 3:
        shutil.copytree(source, dest, copy_function=os.link)
        return

    # Python 2: copytree looks up `copy2` in the shutil module at call time, so swap it temporarily.
    original_copy2 = shutil.copy2
    try:
        shutil.copy2 = os.link
        shutil.copytree(source, dest)
    finally:
        shutil.copy2 = original_copy2
|
||
|
||
|
||
def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict", to_lower=False, lower=False):
    """Split `text` into unicode tokens, optionally lowercasing it and removing accent marks first.

    Parameters
    ----------
    text : str or bytes
        Input string.
    lowercase : bool, optional
        Lowercase the input string?
    deacc : bool, optional
        Remove accentuation using :func:`~gensim.utils.deaccent`?
    encoding : str, optional
        Encoding of input string, used as parameter for :func:`~gensim.utils.to_unicode`.
    errors : str, optional
        Error handling behaviour, used as parameter for :func:`~gensim.utils.to_unicode`.
    to_lower : bool, optional
        Same as `lowercase`. Convenience alias.
    lower : bool, optional
        Same as `lowercase`. Convenience alias.

    Returns
    -------
    generator of str
        The generator created by :func:`~gensim.utils.simple_tokenize`: contiguous sequences of
        alphabetic characters (no digits!). Decoding, lowercasing and de-accenting happen eagerly,
        when this function is called; only the token extraction is lazy.

    Examples
    --------
    >>> from gensim.utils import tokenize
    >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc=True))
    [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']

    """
    text = to_unicode(text, encoding, errors=errors)
    # any of the three aliases switches lowercasing on
    if lowercase or to_lower or lower:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    return simple_tokenize(text)
|
||
|
||
|
||
def simple_tokenize(text):
    """Tokenize input text using :const:`gensim.utils.PAT_ALPHABETIC`.

    Parameters
    ----------
    text : str
        Input text.

    Yields
    ------
    str
        Tokens from `text`, one per match of the pattern, in order of appearance.

    """
    find_tokens = PAT_ALPHABETIC.finditer
    for hit in find_tokens(text):
        # group 0 is the whole matched token
        yield hit.group(0)
|
||
|
||
|
||
def simple_preprocess(doc, deacc=False, min_len=2, max_len=15):
    """Turn a document into a list of lowercase tokens, dropping tokens that are too short or too long.

    Uses :func:`~gensim.utils.tokenize` internally.

    Parameters
    ----------
    doc : str
        Input document.
    deacc : bool, optional
        Remove accent marks from tokens using :func:`~gensim.utils.deaccent`?
    min_len : int, optional
        Minimum length of token (inclusive). Shorter tokens are discarded.
    max_len : int, optional
        Maximum length of token in result (inclusive). Longer tokens are discarded.

    Returns
    -------
    list of str
        Tokens extracted from `doc`. Tokens that start with an underscore are discarded as well.

    """
    kept = []
    # decoding errors are ignored rather than raised
    for token in tokenize(doc, lower=True, deacc=deacc, errors='ignore'):
        if token.startswith('_'):
            continue
        if min_len <= len(token) <= max_len:
            kept.append(token)
    return kept
|
||
|
||
|
||
def any2utf8(text, errors='strict', encoding='utf8'):
    """Convert `text` (unicode, or a bytestring in the given encoding) into a utf8 bytestring.

    Parameters
    ----------
    text : str
        Input text.
    errors : str, optional
        Error handling behaviour if `text` is a bytestring.
    encoding : str, optional
        Encoding of `text` if it is a bytestring.

    Returns
    -------
    str
        Bytestring in utf8.

    """
    if not isinstance(text, unicode):
        # bytestring -> unicode first, so that the final encode step always emits valid utf8
        text = unicode(text, encoding, errors=errors)
    return text.encode('utf8')


# shorter alias for the same function
to_utf8 = any2utf8
|
||
|
||
|
||
def any2unicode(text, encoding='utf8', errors='strict'):
    """Convert `text` (unicode, or a bytestring in the given encoding) to unicode.

    Parameters
    ----------
    text : str
        Input text.
    encoding : str, optional
        Encoding of `text` if it is a bytestring.
    errors : str, optional
        Error handling behaviour if `text` is a bytestring.

    Returns
    -------
    str
        Unicode version of `text`. Unicode input is returned unchanged.

    """
    return text if isinstance(text, unicode) else unicode(text, encoding, errors=errors)


# shorter alias for the same function
to_unicode = any2unicode
|
||
|
||
|
||
def call_on_class_only(*args, **kwargs):
    """Unconditionally raise `AttributeError`; a stand-in for class methods that must not run on instances.

    Used internally.

    Parameters
    ----------
    *args
        Variable length argument list (ignored).
    **kwargs
        Arbitrary keyword arguments (ignored).

    Raises
    ------
    AttributeError
        Always.

    """
    message = 'This method should be called on a class object.'
    raise AttributeError(message)
|
||
|
||
|
||
class SaveLoad(object):
    """Serialize/deserialize object from disk, by equipping objects with the save()/load() methods.

    Warnings
    --------
    This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes
    such as lambda functions etc.

    """
    @classmethod
    def load(cls, fname, mmap=None):
        """Load an object previously saved using :meth:`~gensim.utils.SaveLoad.save` from a file.

        Parameters
        ----------
        fname : str
            Path to file that contains needed object.
        mmap : str, optional
            Memory-map option. If the object was saved with large arrays stored separately, you can load these arrays
            via mmap (shared memory) using `mmap='r'`.
            If the file being loaded is compressed (either '.gz' or '.bz2'), then `mmap=None` **must be** set.

        See Also
        --------
        :meth:`~gensim.utils.SaveLoad.save`
            Save object to file.

        Returns
        -------
        object
            Object loaded from `fname`.

        Raises
        ------
        AttributeError
            When called on an object instance instead of class (this is a class method).

        Warnings
        --------
        The file is unpickled, so only load files that come from a trusted source.

        """
        logger.info("loading %s object from %s", cls.__name__, fname)

        compress, subname = SaveLoad._adapt_by_suffix(fname)

        obj = unpickle(fname)
        obj._load_specials(fname, mmap, compress, subname)
        logger.info("loaded %s", fname)
        return obj

    def _load_specials(self, fname, mmap, compress, subname):
        """Load attributes that were stored separately, and give them the same opportunity
        to recursively load using the :class:`~gensim.utils.SaveLoad` interface.

        Parameters
        ----------
        fname : str
            Input file path.
        mmap : {None, 'r+', 'r', 'w+', 'c'}
            Memory-map options. See `numpy.load(mmap_mode)
            <https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.load.html>`_.
        compress : bool
            Is the input file compressed?
        subname : function
            Builds the name of a side file from `fname` and an attribute name, as produced by
            :meth:`~gensim.utils.SaveLoad._adapt_by_suffix`.

        Raises
        ------
        IOError
            If memory-mapping is requested for a separately stored array of a compressed file.

        """
        def mmap_error(obj, filename):
            return IOError(
                'Cannot mmap compressed object %s in file %s. ' % (obj, filename) +
                'Use `load(fname, mmap=None)` or uncompress files manually.'
            )

        # nested SaveLoad attributes first: each restores its own separately stored arrays
        for attrib in getattr(self, '__recursive_saveloads', []):
            cfname = '.'.join((fname, attrib))
            logger.info("loading %s recursively from %s.* with mmap=%s", attrib, cfname, mmap)
            getattr(self, attrib)._load_specials(cfname, mmap, compress, subname)

        for attrib in getattr(self, '__numpys', []):
            logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)

            if compress:
                if mmap:
                    raise mmap_error(attrib, subname(fname, attrib))

                # read the array inside a `with` block so that the .npz archive gets closed again
                with np.load(subname(fname, attrib)) as archive:
                    val = archive['val']
            else:
                val = np.load(subname(fname, attrib), mmap_mode=mmap)

            setattr(self, attrib, val)

        for attrib in getattr(self, '__scipys', []):
            logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)
            # the matrix object was pickled without its arrays; re-attach them below
            sparse = unpickle(subname(fname, attrib))
            if compress:
                if mmap:
                    raise mmap_error(attrib, subname(fname, attrib))

                with np.load(subname(fname, attrib, 'sparse')) as f:
                    sparse.data = f['data']
                    sparse.indptr = f['indptr']
                    sparse.indices = f['indices']
            else:
                sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap)
                sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap)
                sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap)

            setattr(self, attrib, sparse)

        for attrib in getattr(self, '__ignoreds', []):
            logger.info("setting ignored attribute %s to None", attrib)
            setattr(self, attrib, None)

    @staticmethod
    def _adapt_by_suffix(fname):
        """Get compress setting and filename for numpy file compression.

        Parameters
        ----------
        fname : str
            Input filename.

        Returns
        -------
        (bool, function)
            First element is True if `fname` ends with '.gz' or '.bz2'. The second element is a function
            that joins its string arguments with dots and appends the matching numpy suffix
            ('npz' when compressed, 'npy' otherwise).

        """
        compress, suffix = (True, 'npz') if fname.endswith('.gz') or fname.endswith('.bz2') else (False, 'npy')
        return compress, lambda *args: '.'.join(args + (suffix,))

    def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
        """Save the object to a file. Used internally by :meth:`gensim.utils.SaveLoad.save()`.

        Parameters
        ----------
        fname : str
            Path to file.
        separately : list, optional
            Iterable of attributes that need to be stored in separate files.
        sep_limit : int, optional
            Threshold for automatic separation, see :meth:`~gensim.utils.SaveLoad._save_specials`.
        ignore : frozenset, optional
            Attributes that shouldn't be stored.
        pickle_protocol : int, optional
            Protocol number for pickle.

        Notes
        -----
        If `separately` is None, automatically detect large numpy/scipy.sparse arrays in the object being stored,
        and store them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays back
        on load efficiently.

        You can also set `separately` manually, in which case it must be a list of attribute names to be stored
        in separate files. The automatic check is not performed in this case.

        """
        logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately)

        compress, subname = SaveLoad._adapt_by_suffix(fname)

        restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol,
                                       compress, subname)
        try:
            pickle(self, fname, protocol=pickle_protocol)
        finally:
            # restore attribs handled specially
            for obj, asides in restores:
                for attrib, val in iteritems(asides):
                    setattr(obj, attrib, val)
        logger.info("saved %s", fname)

    def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
        """Save aside any attributes that need to be handled separately, including
        by recursion any attributes that are themselves :class:`~gensim.utils.SaveLoad` instances.

        Parameters
        ----------
        fname : str
            Output filename.
        separately : iterable of str or None
            Attributes to store separately. If None, they are detected automatically using `sep_limit`.
        sep_limit : int
            Threshold for the automatic detection: numpy arrays whose `size` (number of elements) and
            sparse matrices whose `nnz` reach this value are stored separately.
        ignore : iterable of str
            Attributes that shouldn't be stored at all.
        pickle_protocol : int
            Protocol number for pickle.
        compress : bool
            If True - compress output with :func:`numpy.savez_compressed`.
        subname : function
            Produced by :meth:`~gensim.utils.SaveLoad._adapt_by_suffix`

        Returns
        -------
        list of (obj, {attrib: value, ...})
            Settings that the caller should use to restore each object's attributes that were set aside
            during the default :func:`~gensim.utils.pickle`.

        """
        asides = {}
        sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)
        if separately is None:
            separately = []
            for attrib, val in iteritems(self.__dict__):
                if isinstance(val, np.ndarray) and val.size >= sep_limit:
                    separately.append(attrib)
                elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit:
                    separately.append(attrib)

        # whatever's in `separately` or `ignore` at this point won't get pickled;
        # `list(...)` lets callers pass tuples or other iterables, not only lists
        for attrib in list(separately) + list(ignore):
            if hasattr(self, attrib):
                asides[attrib] = getattr(self, attrib)
                delattr(self, attrib)

        recursive_saveloads = []
        restores = []
        for attrib, val in iteritems(self.__dict__):
            if hasattr(val, '_save_specials'):  # better than 'isinstance(val, SaveLoad)' if IPython reloading
                recursive_saveloads.append(attrib)
                cfname = '.'.join((fname, attrib))
                restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname))

        try:
            numpys, scipys, ignoreds = [], [], []
            for attrib, val in iteritems(asides):
                if isinstance(val, np.ndarray) and attrib not in ignore:
                    numpys.append(attrib)
                    logger.info("storing np array '%s' to %s", attrib, subname(fname, attrib))

                    if compress:
                        np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val))
                    else:
                        np.save(subname(fname, attrib), np.ascontiguousarray(val))

                elif isinstance(val, sparse_matrices) and attrib not in ignore:
                    scipys.append(attrib)
                    logger.info("storing scipy.sparse array '%s' under %s", attrib, subname(fname, attrib))

                    if compress:
                        np.savez_compressed(
                            subname(fname, attrib, 'sparse'),
                            data=val.data,
                            indptr=val.indptr,
                            indices=val.indices
                        )
                    else:
                        np.save(subname(fname, attrib, 'data'), val.data)
                        np.save(subname(fname, attrib, 'indptr'), val.indptr)
                        np.save(subname(fname, attrib, 'indices'), val.indices)

                    # temporarily detach the arrays so that only the matrix "shell" gets pickled
                    data, indptr, indices = val.data, val.indptr, val.indices
                    val.data, val.indptr, val.indices = None, None, None

                    try:
                        # store array-less object
                        pickle(val, subname(fname, attrib), protocol=pickle_protocol)
                    finally:
                        val.data, val.indptr, val.indices = data, indptr, indices
                else:
                    logger.info("not storing attribute %s", attrib)
                    ignoreds.append(attrib)

            # bookkeeping read back by `_load_specials`; written straight into `__dict__`
            # so the double-underscore names are stored verbatim
            self.__dict__['__numpys'] = numpys
            self.__dict__['__scipys'] = scipys
            self.__dict__['__ignoreds'] = ignoreds
            self.__dict__['__recursive_saveloads'] = recursive_saveloads
        except Exception:
            # restore the attributes if exception-interrupted
            for attrib, val in iteritems(asides):
                setattr(self, attrib, val)
            raise
        return restores + [(self, asides)]

    def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
        """Save the object to a file.

        Parameters
        ----------
        fname_or_handle : str or file-like
            Path to output file or already opened file-like object. If the object is a file handle,
            no special array handling will be performed, all attributes will be saved to the same file.
        separately : list of str or None, optional
            If None, automatically detect large numpy/scipy.sparse arrays in the object being stored, and store
            them into separate files. This prevent memory errors for large objects, and also allows
            `memory-mapping <https://en.wikipedia.org/wiki/Mmap>`_ the large arrays for efficient
            loading and sharing the large arrays in RAM between multiple processes.

            If list of str: store these attributes into separate files. The automated size check
            is not performed in this case.
        sep_limit : int, optional
            Threshold for the automatic detection: numpy arrays whose `size` (number of elements) and
            sparse matrices whose `nnz` reach this value are stored separately.
        ignore : frozenset of str, optional
            Attributes that shouldn't be stored at all.
        pickle_protocol : int, optional
            Protocol number for pickle.

        See Also
        --------
        :meth:`~gensim.utils.SaveLoad.load`
            Load object from file.

        """
        try:
            # works when `fname_or_handle` is an open, writable file-like object
            _pickle.dump(self, fname_or_handle, protocol=pickle_protocol)
            logger.info("saved %s object", self.__class__.__name__)
        except TypeError:  # `fname_or_handle` does not have write attribute
            self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol)
|
||
|
||
|
||
def identity(p):
    """Return the argument unchanged.

    A named, picklable stand-in for `lambda x: x`, for flows that don't accept lambdas (pickling etc).

    Parameters
    ----------
    p : object
        Input parameter.

    Returns
    -------
    object
        The very same object as `p`.

    """
    unchanged = p
    return unchanged
|
||
|
||
|
||
def get_max_id(corpus):
    """Find the highest feature id that appears in the corpus.

    Parameters
    ----------
    corpus : iterable of iterable of (int, numeric)
        Collection of texts in BoW format.

    Returns
    -------
    int
        Highest feature id.

    Notes
    -----
    For empty `corpus` return -1.

    """
    highest = -1  # also the answer when no id is seen at all
    for document in corpus:
        for fieldid, _ in document:
            if fieldid > highest:
                highest = fieldid
    return highest
|
||
|
||
|
||
class FakeDict(object):
    """Objects of this class act as dictionaries that map integer->str(integer), for a specified
    range of integers <0, num_terms).

    This is meant to avoid allocating real dictionaries when `num_terms` is huge, which is a waste of memory.

    """
    def __init__(self, num_terms):
        """

        Parameters
        ----------
        num_terms : int
            Number of terms.

        """
        self.num_terms = num_terms

    def __str__(self):
        return "FakeDict(num_terms=%s)" % self.num_terms

    def __getitem__(self, val):
        """Get `str(val)` for an id inside <0, num_terms).

        Raises
        ------
        ValueError
            If `val` is outside of the range.

        """
        if 0 <= val < self.num_terms:
            return str(val)
        raise ValueError("internal id out of bounds (%s, expected <0..%s))" % (val, self.num_terms))

    def __contains__(self, val):
        """Check whether `val` is one of the ids covered by this mapping.

        Without this method, `val in fake_dict` would fall back to indexing with 0, 1, 2, ...
        and end with the `ValueError` that `__getitem__` raises past the last id.

        Returns
        -------
        bool
            True if `0 <= val < num_terms`; False otherwise, including for values that cannot be
            compared with integers.

        """
        try:
            return 0 <= val < self.num_terms
        except TypeError:
            return False

    def iteritems(self):
        """Iterate over all keys and values.


        Yields
        ------
        (int, str)
            Pair of (id, token).

        """
        for i in xrange(self.num_terms):
            yield i, str(i)

    def keys(self):
        """Override the `dict.keys()`, which is used to determine the maximum internal id of a corpus,
        i.e. the vocabulary dimensionality.

        Returns
        -------
        list of int
            Highest id, packed in list.

        Notes
        -----
        To avoid materializing the whole `range(0, self.num_terms)`,
        this returns the highest id = `[self.num_terms - 1]` only.

        """
        return [self.num_terms - 1]

    def __len__(self):
        return self.num_terms

    def get(self, val, default=None):
        """Get `str(val)` for an id inside <0, num_terms), or `default` for any other id."""
        if 0 <= val < self.num_terms:
            return str(val)
        return default
|
||
|
||
|
||
def dict_from_corpus(corpus):
    """Scan corpus for the highest word id that appears in it, then construct a mapping
    which maps each `word_id` -> `str(word_id)`.

    Parameters
    ----------
    corpus : iterable of iterable of (int, numeric)
        Collection of texts in BoW format.

    Returns
    -------
    id2word : :class:`~gensim.utils.FakeDict`
        "Fake" mapping which maps each `word_id` -> `str(word_id)`.

    Warnings
    --------
    This function is used whenever *words* need to be displayed (as opposed to just their ids)
    but no `word_id` -> `word` mapping was provided. The resulting mapping covers every id
    from 0 up to the highest `word_id` found in the corpus.

    """
    # ids start at zero, so the number of terms is one more than the highest id
    return FakeDict(1 + get_max_id(corpus))
|
||
|
||
|
||
def is_corpus(obj):
    """Check whether `obj` is a corpus, by peeking at its first element. Works even on streamed generators.
    The peeked element is put back into the object returned by this function, so always use
    that returned object instead of the original `obj`.

    Parameters
    ----------
    obj : object
        An `iterable of iterable` that contains (int, numeric).

    Returns
    -------
    (bool, object)
        Pair of (is `obj` a corpus, `obj` with peeked element restored)

    Examples
    --------
    >>> from gensim.utils import is_corpus
    >>> corpus = [[(1, 1.0)], [(2, -0.3), (3, 0.12)]]
    >>> corpus_or_not, corpus = is_corpus(corpus)

    Warnings
    --------
    An "empty" corpus (empty input sequence) is ambiguous, so in this case
    the result is forcefully defined as (False, `obj`).

    """
    # quick hack for the most common case: the class name says it is a corpus
    try:
        named_like_corpus = 'Corpus' in obj.__class__.__name__
    except Exception:
        named_like_corpus = False
    if named_like_corpus:
        return True, obj

    try:
        is_iterator = hasattr(obj, 'next') or hasattr(obj, '__next__')
        if is_iterator:
            # Taking an element from an iterator consumes it, so chain the peeked
            # document back in front of the remaining ones.
            first_doc = next(obj)
            obj = itertools.chain([first_doc], obj)
        else:
            first_doc = next(iter(obj))  # empty corpus is resolved to False here

        # sparse documents must have a __len__ function (list, tuple...)
        if len(first_doc) == 0:
            return True, obj  # the first document is empty=>assume this is a corpus

        # if obj is a 1D numpy array(scalars) instead of 2-tuples, it resolves to False here
        first_id, first_val = next(iter(first_doc))
        # must be a 2-tuple (integer, float); the conversions are only done for validation
        int(first_id), float(first_val)
    except Exception:
        return False, obj
    return True, obj
|
||
|
||
|
||
def get_my_ip():
    """Try to obtain our external ip (from the Pyro4 nameserver's point of view)

    Returns
    -------
    str
        IP address.

    Warnings
    --------
    This tries to sidestep the issue of bogus `/etc/hosts` entries and other local misconfiguration,
    which often mess up hostname resolution.
    If all else fails, fall back to simple `socket.gethostbyname()` lookup.

    Notes
    -----
    Three strategies are tried in order: ask the socket layer which local address is used to reach
    the Pyro4 nameserver, parse the output of `ifconfig` (relies on the Python 2 only `commands`
    module), and finally resolve our own hostname.

    """
    import socket
    try:
        from Pyro4.naming import locateNS
        # we know the nameserver must exist, so use it as our anchor point
        ns = locateNS()
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect((ns._pyroUri.host, ns._pyroUri.port))
            result, port = s.getsockname()
        finally:
            # the socket is only needed to learn our own address; don't leak its descriptor
            s.close()
    except Exception:
        try:
            # see what ifconfig says about our default interface
            import commands
            result = commands.getoutput("ifconfig").split("\n")[1].split()[1][5:]
            # sanity check: a dotted IPv4 address has exactly four parts
            if len(result.split('.')) != 4:
                raise Exception()
        except Exception:
            # give up, leave the resolution to gethostbyname
            result = socket.gethostbyname(socket.gethostname())
    return result
|
||
|
||
|
||
class RepeatCorpus(SaveLoad):
    """Wrap a `corpus` as another corpus that yields exactly `reps` documents. This is achieved by
    repeating documents from `corpus` over and over again, until `reps` documents have been produced.
    Repetition is done on-the-fly=efficiently, via `itertools`.

    Examples
    --------
    >>> from gensim.utils import RepeatCorpus
    >>>
    >>> corpus = [[(1, 2)], []]  # 2 documents
    >>> list(RepeatCorpus(corpus, 5))  # repeat 2.5 times to get 5 documents
    [[(1, 2)], [], [(1, 2)], [], [(1, 2)]]

    """
    def __init__(self, corpus, reps):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        reps : int
            Total number of documents to yield.

        """
        self.corpus = corpus
        self.reps = reps

    def __iter__(self):
        # cycle endlessly over the wrapped corpus, then cut the stream off after `reps` documents
        endless = itertools.cycle(self.corpus)
        return itertools.islice(endless, self.reps)
|
||
|
||
|
||
class RepeatCorpusNTimes(SaveLoad):
    """Wrap a `corpus` and repeat the whole of it `n` times.

    Examples
    --------
    >>> from gensim.utils import RepeatCorpusNTimes
    >>>
    >>> corpus = [[(1, 0.5)], []]
    >>> list(RepeatCorpusNTimes(corpus, 3))  # repeat 3 times
    [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []]

    """
    def __init__(self, corpus, n):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        n : int
            Number of repeats for corpus.

        """
        self.corpus = corpus
        self.n = n

    def __iter__(self):
        # each pass starts a fresh iteration over the wrapped corpus
        for _pass_no in xrange(self.n):
            for doc in self.corpus:
                yield doc
|
||
|
||
|
||
class ClippedCorpus(SaveLoad):
    """Wrap a `corpus` and return at most `max_docs` documents from it."""
    def __init__(self, corpus, max_docs=None):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        max_docs : int or None, optional
            Maximum number of documents in the wrapped corpus.

        Warnings
        --------
        Any documents after `max_docs` are ignored. This effectively limits the length of the returned corpus
        to <= `max_docs`. Set `max_docs=None` for "no limit", effectively wrapping the entire input corpus.

        """
        self.corpus = corpus
        self.max_docs = max_docs

    def __iter__(self):
        # `islice` treats a stop value of None as "no limit"
        return itertools.islice(self.corpus, self.max_docs)

    def __len__(self):
        # `max_docs=None` means "no limit"; comparing None with an int would fail
        if self.max_docs is None:
            return len(self.corpus)
        return min(self.max_docs, len(self.corpus))
|
||
|
||
|
||
class SlicedCorpus(SaveLoad):
    """Wrap `corpus` and return a slice of it."""
    def __init__(self, corpus, slice_):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        slice_ : slice or iterable
            Slice for `corpus`.

        Notes
        -----
        Negative slicing can only be used if the corpus is indexable, otherwise, the corpus will be iterated over.
        Slice can also be a np.ndarray to support fancy indexing.

        Calculating the size of a SlicedCorpus is expensive when using a slice as the corpus has
        to be iterated over once. Using a list or np.ndarray does not have this drawback, but consumes more memory.

        """
        self.corpus = corpus
        self.slice_ = slice_
        self.length = None  # cached result of `__len__`, computed on first use

    def __iter__(self):
        # indexed corpus: apply the slice to the index and fetch the selected documents by offset
        if hasattr(self.corpus, 'index') and len(self.corpus.index) > 0:
            return (self.corpus.docbyoffset(i) for i in self.corpus.index[self.slice_])
        # otherwise stream over the corpus; this path reads `start`/`stop`/`step` from `slice_`
        return itertools.islice(self.corpus, self.slice_.start, self.slice_.stop, self.slice_.step)

    def __len__(self):
        # check cached length, calculate if needed
        if self.length is None:
            if isinstance(self.slice_, (list, np.ndarray)):
                self.length = len(self.slice_)
            elif isinstance(self.slice_, slice):
                # Let the range type count the selected positions: this is correct for negative
                # steps and yields 0 (never a negative number) for empty slices.
                start, end, step = self.slice_.indices(len(self.corpus.index))
                self.length = len(xrange(start, end, step))
            else:
                self.length = sum(1 for x in self)

        return self.length
|
||
|
||
|
||
def safe_unichr(intval):
    """Convert an integer code point into the corresponding unicode character.

    When `unichr` rejects the value with a `ValueError`, the character is built instead by decoding
    the escaped string `\\U<8 hex digits of intval>` with the `unicode-escape` codec.

    Parameters
    ----------
    intval : int
        Integer code of character

    Returns
    -------
    string
        Unicode string of character

    """
    try:
        char = unichr(intval)
    except ValueError:
        # ValueError: unichr() arg not in range(0x10000) (narrow Python build)
        escaped = "\\U%08x" % intval
        # return UTF16 surrogate pair
        return escaped.decode('unicode-escape')
    return char
|
||
|
||
|
||
def decode_htmlentities(text):
    """Decode all HTML entities in text that are encoded as hex, decimal or named entities.
    Adapted from `python-twitter-ircbot/html_decode.py
    <http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py>`_.

    Parameters
    ----------
    text : str
        Input HTML.

    Returns
    -------
    str
        `text` with every recognised entity replaced by its character; entities that cannot be
        decoded are left untouched.

    Examples
    --------
    >>> from gensim.utils import decode_htmlentities
    >>>
    >>> u = u'E tu vivrai nel terrore - L&#x27;aldil&agrave; (1981)'
    >>> print(decode_htmlentities(u).encode('UTF-8'))
    E tu vivrai nel terrore - L'aldilà (1981)
    >>> print(decode_htmlentities("l&#39;eau"))
    l'eau
    >>> print(decode_htmlentities("foo &lt; bar"))
    foo < bar

    """
    def substitute_entity(match):
        try:
            ent = match.group(3)
            if match.group(1) == "#":
                # decoding by number
                if match.group(2) == '':
                    # number is in decimal
                    return safe_unichr(int(ent))
                elif match.group(2) in ['x', 'X']:
                    # number is in hex
                    return safe_unichr(int(ent, 16))
                # unreachable with the current pattern; keep the input rather than return None
                return match.group()
            else:
                # they were using a name; the optional hex-marker group of the pattern swallows
                # a leading `x`/`X` of the name (e.g. `&xi;`), so glue it back before the lookup
                cp = n2cp.get(match.group(2) + ent)
                if cp:
                    return safe_unichr(cp)
                else:
                    return match.group()
        except Exception:
            # in case of errors, return original input
            return match.group()

    return RE_HTML_ENTITY.sub(substitute_entity, text)
|
||
|
||
|
||
def chunkize_serial(iterable, chunksize, as_numpy=False, dtype=np.float32):
    """Stream `iterable` as consecutive groups of `chunksize` elements.

    The final group is shorter when the number of elements is not a multiple of `chunksize`.

    Parameters
    ----------
    iterable : iterable of object
        An iterable.
    chunksize : int
        Split iterable into chunks of this size.
    as_numpy : bool, optional
        Convert every element of a chunk into a `np.ndarray` before yielding the chunk.
    dtype : numpy dtype, optional
        Data type of the arrays created when `as_numpy` is set.

    Yields
    ------
    list of object OR list of np.ndarray
        "chunksize"-ed chunks of elements from `iterable`.

    Examples
    --------
    >>> print(list(grouper(range(10), 3)))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    """
    stream = iter(iterable)
    size = int(chunksize)
    while True:
        batch = itertools.islice(stream, size)
        if as_numpy:
            # convert each document to a 2d numpy array (~6x faster when transmitting
            # chunk data over the wire, in Pyro)
            holder = [[np.array(doc, dtype=dtype) for doc in batch]]
        else:
            holder = [list(batch)]
        if not holder[0]:
            return
        # memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference
        yield holder.pop()


grouper = chunkize_serial
|
||
|
||
|
||
class InputQueue(multiprocessing.Process):
    """Process that reads a streamed corpus, splits it into chunks and feeds them into a queue.

    Meant for reading and chunking corpora in the background, in a separate process,
    so that workers consuming the queue are not starved for input chunks.

    """
    def __init__(self, q, corpus, chunksize, maxsize, as_numpy):
        """
        Parameters
        ----------
        q : multiprocessing.Queue
            Enqueue chunks into this queue.
        corpus : iterable of iterable of (int, numeric)
            Corpus to read and split into "chunksize"-ed groups
        chunksize : int
            Split `corpus` into chunks of this size.
        maxsize : int
            Stored on the instance as `self.maxsize`; not used by :meth:`run`.
        as_numpy : bool, optional
            Enqueue chunks as lists of `numpy.ndarray` instead of lists of documents.

        """
        super(InputQueue, self).__init__()
        self.q, self.maxsize = q, maxsize
        self.corpus, self.chunksize = corpus, chunksize
        self.as_numpy = as_numpy

    def run(self):
        """Push the chunks of the corpus into the queue one by one, followed by a final `None` marker."""
        stream = iter(self.corpus)
        while True:
            docs = itertools.islice(stream, self.chunksize)
            if self.as_numpy:
                # HACK XXX convert documents to numpy arrays, to save memory.
                # This also gives a scipy warning at runtime:
                # "UserWarning: indices array has non-integer dtype (float64)"
                holder = [[np.asarray(doc) for doc in docs]]
            else:
                holder = [list(docs)]

            if not holder[0]:
                # corpus exhausted: `None` tells the consumer that no more chunks will come
                self.q.put(None, block=True)
                return

            try:
                pending = self.q.qsize()
            except NotImplementedError:
                pending = '?'
            logger.debug("prepared another chunk of %i documents (qsize=%s)", len(holder[0]), pending)
            # pop() so that no reference to the chunk is left behind in this process
            self.q.put(holder.pop(), block=True)
|
||
|
||
|
||
if os.name == 'nt':
    warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

    def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
        """Split `corpus` into fixed-sized chunks, using :func:`~gensim.utils.chunkize_serial`.

        Parameters
        ----------
        corpus : iterable of object
            An iterable.
        chunksize : int
            Split `corpus` into chunks of this size.
        maxsize : int, optional
            Ignored. For interface compatibility only.
        as_numpy : bool, optional
            Yield chunks as `np.ndarray`s instead of lists?

        Yields
        ------
        list OR np.ndarray
            "chunksize"-ed chunks of elements from `corpus`.

        """
        serial_chunks = chunkize_serial(corpus, chunksize, as_numpy=as_numpy)
        for chunk in serial_chunks:
            yield chunk
else:
    def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
        """Split `corpus` into fixed-sized chunks, optionally preparing them in a background process.

        Parameters
        ----------
        corpus : iterable of object
            An iterable.
        chunksize : int
            Split `corpus` into chunks of this size.
        maxsize : int, optional
            If > 0, prepare chunks in a background process, filling a chunk queue of size at most `maxsize`.
        as_numpy : bool, optional
            Yield chunks as `np.ndarray` instead of lists?

        Yields
        ------
        list OR np.ndarray
            "chunksize"-ed chunks of elements from `corpus`.

        Notes
        -----
        Each chunk is of length `chunksize`, except the last one which may be smaller.
        A once-only input stream (`corpus` from a generator) is ok, chunking is done efficiently via itertools.

        If `maxsize > 0`, don't wait idly in between successive chunk `yields`, but rather keep filling a short queue
        (of size at most `maxsize`) with forthcoming chunks in advance. This is realized by starting a separate process,
        and is meant to reduce I/O delays, which can be significant when `corpus` comes from a slow medium
        like HDD, database or network.

        If `maxsize == 0`, don't fool around with parallelism and simply yield the chunksize
        via :func:`~gensim.utils.chunkize_serial` (no I/O optimizations).

        """
        assert chunksize > 0

        if not maxsize > 0:
            # no background process requested: plain in-process chunking
            for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):
                yield chunk
            return

        q = multiprocessing.Queue(maxsize=maxsize)
        worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy)
        worker.daemon = True
        worker.start()
        while True:
            # wrap + pop(), so that this frame does not keep a reference to the yielded chunk
            holder = [q.get(block=True)]
            if holder[0] is None:
                # `None` is the end-of-corpus marker put into the queue by the worker
                return
            yield holder.pop()
|
||
|
||
|
||
def smart_extension(fname, ext):
    """Add the extension `ext` to `fname`, keeping a trailing `.bz2` or `.gz` compression extension
    (if there is one) as the very last extension.

    Parameters
    ----------
    fname : str
        Filename or full path.
    ext : str
        Extension to append before any compression extensions.

    Returns
    -------
    str
        New path to file with `ext` appended.

    Examples
    --------

    >>> from gensim.utils import smart_extension
    >>> smart_extension("my_file.pkl.gz", ".vectors")
    'my_file.pkl.vectors.gz'

    """
    stem, last_ext = os.path.splitext(fname)
    for compressed in ('.bz2', '.gz'):
        if last_ext.endswith(compressed):
            # slot `ext` in front of the compression extension
            return stem + last_ext[:-len(compressed)] + ext + compressed
    return stem + last_ext + ext
|
||
|
||
|
||
def pickle(obj, fname, protocol=2):
    """Serialize `obj` with pickle into the file `fname`.

    The file is opened through `smart_open`, so that `fname` can be on S3, HDFS, compressed etc.

    Parameters
    ----------
    obj : object
        Any python object.
    fname : str
        Path to pickle file.
    protocol : int, optional
        Pickle protocol number. Default is 2 in order to support compatibility across python 2.x and 3.x.

    """
    # 'b' for binary, needed on Windows
    with smart_open(fname, 'wb') as sink:
        _pickle.dump(obj, sink, protocol=protocol)
|
||
|
||
|
||
def unpickle(fname):
    """Read a pickled object back from the file `fname`.

    The file is opened through `smart_open`, so that `fname` can be on S3, HDFS, compressed etc.

    Parameters
    ----------
    fname : str
        Path to pickle file.

    Returns
    -------
    object
        Python object loaded from `fname`.

    Warnings
    --------
    Unpickling executes code embedded in the file; only use this on files from a trusted source.

    """
    with smart_open(fname, 'rb') as source:
        if not sys.version_info > (3, 0):
            # Because of loading from S3 load can't be used (missing readline in smart_open)
            return _pickle.loads(source.read())
        return _pickle.load(source, encoding='latin1')
|
||
|
||
|
||
def revdict(d):
    """Swap keys and values of a mapping, i.e. `{1: 2, 3: 4}` -> `{2: 1, 4: 3}`.

    Parameters
    ----------
    d : dict
        Input dictionary.

    Returns
    -------
    dict
        Reversed dictionary mapping.

    Notes
    -----
    When two keys map to the same value, only one of them will be kept in the result (which one is kept is arbitrary).

    Examples
    --------
    >>> from gensim.utils import revdict
    >>> d = {1: 2, 3: 4}
    >>> revdict(d)
    {2: 1, 4: 3}

    """
    swapped = {}
    for key, value in iteritems(dict(d)):
        swapped[value] = key
    return swapped
|
||
|
||
|
||
def deprecated(reason):
    """Decorator that flags a callable as deprecated.

    Every call of the decorated callable emits a `DeprecationWarning` through `warnings.warn`.
    Usable both with a reason (``@deprecated("why")``) and bare (``@deprecated``).
    Adapted from https://stackoverflow.com/a/40301488/8001386.

    Parameters
    ----------
    reason : str
        Reason of deprecation. When the decorator is applied without arguments, this is the
        decorated class or function itself.

    Returns
    -------
    function
        Decorated function

    Raises
    ------
    TypeError
        If `reason` is neither a string, a class nor a function.

    """
    if isinstance(reason, string_types):
        # used as @deprecated("some reason"): build the real decorator
        def decorator(func):
            template = "Call to deprecated `{name}` ({reason})."

            @wraps(func)
            def wrapper(*args, **kwargs):
                message = template.format(name=func.__name__, reason=reason)
                # stacklevel=2 attributes the warning to the caller of the deprecated callable
                warnings.warn(message, category=DeprecationWarning, stacklevel=2)
                return func(*args, **kwargs)

            return wrapper

        return decorator

    if not (inspect.isclass(reason) or inspect.isfunction(reason)):
        raise TypeError(repr(type(reason)))

    # used as bare @deprecated: `reason` is the decorated object
    target = reason
    template = "Call to deprecated `{name}`."

    @wraps(target)
    def bare_wrapper(*args, **kwargs):
        message = template.format(name=target.__name__)
        warnings.warn(message, category=DeprecationWarning, stacklevel=2)
        return target(*args, **kwargs)

    return bare_wrapper
|
||
|
||
|
||
@deprecated("Function will be removed in 4.0.0")
def toptexts(query, texts, index, n=10):
    """Debugging helper: list the `n` documents most similar to `query` according to the similarity
    index `index`, together with their texts, to check by eye whether they really relate to the query.

    Parameters
    ----------
    query : {list of (int, number), numpy.ndarray}
        vector OR BoW (list of tuples)
    texts : object
        object that can return something insightful for each document via `texts[docid]`,
        such as its fulltext or snippet.
    index : any
        An instance from :mod:`gensim.similarity.docsim`.

    Returns
    -------
    list
        a list of 3-tuples (docid, doc's similarity to the query, texts[docid])

    """
    similarities = index[query]  # perform a similarity query against the corpus
    ranked = sorted(enumerate(similarities), key=lambda pair: -pair[1])

    top = []
    for docid, score in ranked[:n]:  # only consider top-n most similar docs
        top.append((docid, score, texts[docid]))
    return top
|
||
|
||
|
||
def randfname(prefix='gensim'):
    """Build a path to a randomly named file inside the system's temporary folder.

    Parameters
    ----------
    prefix : str
        Prefix of filename.

    Returns
    -------
    str
        Full path in the in system's temporary folder, ending in a random filename.

    """
    # random suffix of up to six hexadecimal digits
    suffix = hex(random.randint(0, 0xffffff))[2:]
    basename = prefix + suffix
    return os.path.join(tempfile.gettempdir(), basename)
|
||
|
||
|
||
@deprecated("Function will be removed in 4.0.0")
def upload_chunked(server, docs, chunksize=1000, preprocess=None):
    """Memory-friendly upload of documents to a SimServer (or Pyro SimServer proxy).

    Parameters
    ----------
    server : object
        Receiver of the documents; every chunk is handed over via `server.buffer(chunk)`.
    docs : iterable of dict
        Documents to upload. If `preprocess` is given, each document must have a 'text' entry.
    chunksize : int, optional
        Number of documents sent in one chunk.
    preprocess : function, optional
        If given, each document gets a 'tokens' entry set to `preprocess(doc['text'])`
        and its 'text' entry is deleted (the document is modified in place).

    Notes
    -----
    Use this function to train or index large collections -- avoid sending the
    entire corpus over the wire as a single Pyro in-memory object. The documents
    will be sent in smaller chunks, of `chunksize` documents each.

    """
    first = 0
    for chunk in grouper(docs, chunksize):
        past_last = first + len(chunk)
        logger.info("uploading documents %i-%i", first, past_last - 1)
        if preprocess is not None:
            processed = []
            for doc in chunk:
                doc['tokens'] = preprocess(doc['text'])
                del doc['text']
                processed.append(doc)
            chunk = processed
        server.buffer(chunk)
        first = past_last
|
||
|
||
|
||
def getNS(host=None, port=None, broadcast=True, hmac_key=None):
    """Locate the Pyro4 name server and return a proxy for it.

    Parameters
    ----------
    host : str, optional
        Name server hostname.
    port : int, optional
        Name server port.
    broadcast : bool, optional
        Use broadcast mechanism? (i.e. reach out to all Pyro nodes in the network)
    hmac_key : str, optional
        Private key.

    Raises
    ------
    RuntimeError
        When Pyro name server is not found.

    Returns
    -------
    :class:`Pyro4.core.Proxy`
        Proxy from Pyro4.

    """
    import Pyro4
    try:
        name_server = Pyro4.locateNS(host, port, broadcast, hmac_key)
    except Pyro4.errors.NamingError:
        raise RuntimeError("Pyro name server not found")
    return name_server
|
||
|
||
|
||
def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf=None):
    """Register an object with the Pyro name server and serve requests for it.

    The object is registered under `name`, or `name`+ some random suffix if `random_suffix` is set.
    Any previous registration under the same name is removed from the name server first.
    The function ends by entering the daemon's request loop (presumably blocking until the daemon
    is terminated).

    Parameters
    ----------
    name : str
        Name to register `obj` under.
    obj : object
        Object to expose through Pyro.
    random_suffix : bool, optional
        Append a dot and a random hexadecimal suffix to `name`?
    ip : str, optional
        Address for the Pyro daemon; the result of `get_my_ip()` is used when not given.
    port : int, optional
        Port for the Pyro daemon; 0 is passed to Pyro when not given.
    ns_conf : dict, optional
        Keyword arguments forwarded to :func:`~gensim.utils.getNS`.

    """
    locator_kwargs = {} if ns_conf is None else ns_conf
    if random_suffix:
        name += '.' + hex(random.randint(0, 0xffffff))[2:]

    import Pyro4
    with getNS(**locator_kwargs) as name_server:
        with Pyro4.Daemon(ip or get_my_ip(), port or 0) as pyro:
            # register server for remote access
            uri = pyro.register(obj, name)
            name_server.remove(name)
            name_server.register(name, uri)
            logger.info("%s registered with nameserver (URI '%s')", name, uri)
            pyro.requestLoop()
|
||
|
||
|
||
def has_pattern():
    """Tell whether the `pattern <https://github.com/clips/pattern>`_ package can be imported.

    Returns
    -------
    bool
        Is `pattern` installed?

    """
    try:
        from pattern.en import parse  # noqa:F401
    except ImportError:
        return False
    return True
|
||
|
||
|
||
def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
        stopwords=frozenset(), min_length=2, max_length=15):
    """Extract UTF8-encoded tokens in their base form aka lemma (e.g. "are, is, being" becomes "be" etc.),
    using the English lemmatizer from `pattern <https://github.com/clips/pattern>`_.

    This is a smarter version of stemming, taking word context into account.

    Parameters
    ----------
    content : str
        Input string
    allowed_tags : :class:`_sre.SRE_Pattern`, optional
        Compiled regexp to select POS that will be used.
        Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
    light : bool, optional
        DEPRECATED FLAG, NOT SUPPORTED BY `pattern`. Setting it only triggers a warning.
    stopwords : frozenset, optional
        Set of words that will be removed from output.
    min_length : int, optional
        Minimal token length in output (inclusive).
    max_length : int, optional
        Maximal token length in output (inclusive).

    Returns
    -------
    list of str
        UTF8-encoded tokens in the form `lemma/TAG`, where `TAG` is the first two characters of the POS tag.

    Warnings
    --------
    This function is only available when the optional `pattern <https://github.com/clips/pattern>`_ is installed.

    Raises
    ------
    ImportError
        If `pattern <https://github.com/clips/pattern>`_ not installed.

    Examples
    --------
    >>> from gensim.utils import lemmatize
    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

    Note the context-dependent part-of-speech tags between these two examples:

    >>> lemmatize('The study ranks high.')
    ['study/NN', 'rank/VB', 'high/JJ']

    >>> lemmatize('The ranks study hard.')
    ['rank/NN', 'study/VB', 'hard/RB']

    """
    if not has_pattern():
        raise ImportError(
            "Pattern library is not installed. Pattern library is needed in order to use lemmatize function"
        )
    from pattern.en import parse

    if light:
        import warnings
        warnings.warn("The light flag is no longer supported by pattern.")

    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
    # FIXME this throws away all fancy parsing cues, including sentence structure,
    # abbreviations etc.
    cleaned = u(' ').join(tokenize(content, lower=True, errors='ignore'))

    lemmas = []
    for sentence in parse(cleaned, lemmata=True, collapse=False):
        for token, tag, _, _, lemma in sentence:
            if not min_length <= len(lemma) <= max_length:
                continue
            if lemma.startswith('_') or lemma in stopwords:
                continue
            if not allowed_tags.match(tag):
                continue
            tagged = lemma + "/" + tag[:2]
            lemmas.append(tagged.encode('utf8'))
    return lemmas
|
||
|
||
|
||
def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):
    """Generate one random gensim BoW vector whose feature counts follow the Poisson distribution.

    Parameters
    ----------
    dim : int, optional
        Dimension of vector.
    prob_nnz : float, optional
        Probability that a coordinate is nonzero. The value of a nonzero coordinate is
        one plus a draw from the Poisson distribution.
    lam : float, optional
        Lambda parameter for the Poisson distribution.

    Returns
    -------
    list of (int, float)
        Vector in BoW format.

    """
    # one uniform draw per coordinate decides whether the coordinate is present at all
    draws = np.random.uniform(size=(dim,))
    row = []
    for i in xrange(dim):
        if draws[i] < prob_nnz:
            # +1.0 keeps every stored count strictly positive
            row.append((i, float(np.random.poisson(lam=lam) + 1.0)))
    return row
|
||
|
||
|
||
def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0):
    """Generate a random Gensim-style corpus (BoW), one :func:`~gensim.utils.mock_data_row` per document.

    Parameters
    ----------
    n_items : int
        Size of corpus
    dim : int
        Dimension of vector, used for :func:`~gensim.utils.mock_data_row`.
    prob_nnz : float, optional
        Probability that a coordinate is nonzero, used for :func:`~gensim.utils.mock_data_row`.
    lam : float, optional
        Parameter for Poisson distribution, used for :func:`~gensim.utils.mock_data_row`.

    Returns
    -------
    list of list of (int, float)
        Gensim-style corpus.

    """
    corpus = []
    for _ in xrange(n_items):
        corpus.append(mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam))
    return corpus
|
||
|
||
|
||
def prune_vocab(vocab, min_reduce, trim_rule=None):
    """Remove all entries from the `vocab` dictionary with count smaller than `min_reduce`.

    Modifies `vocab` in place, returns the sum of all counts that were pruned.

    Parameters
    ----------
    vocab : dict
        Input dictionary.
    min_reduce : int
        Frequency threshold for tokens in `vocab`.
    trim_rule : function, optional
        Custom rule deciding which entries to keep, passed on to :func:`~gensim.utils.keep_vocab_item`.
        Without it, an entry is kept iff `vocab[w] >= min_reduce`.

    Returns
    -------
    result : int
        Sum of all counts that were pruned.

    """
    result = 0
    old_len = len(vocab)
    # iterate over a snapshot of the keys, because entries are deleted inside the loop
    for w in list(vocab):
        if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule):  # by default: vocab[w] < min_reduce
            result += vocab[w]
            del vocab[w]
    # the default rule keeps `count >= min_reduce`, so what gets pruned is `count < min_reduce`
    logger.info(
        "pruned out %i tokens with count <%i (before %i, after %i)",
        old_len - len(vocab), min_reduce, old_len, len(vocab)
    )
    return result
|
||
|
||
|
||
def trim_vocab_by_freq(vocab, topk, trim_rule=None):
    """Keep only the `topk` most frequent words of `vocab`.
    Words that are as frequent as the `topk`-th one are kept as well.
    Modifies `vocab` in place, returns nothing.

    Parameters
    ----------
    vocab : dict
        Input dictionary.
    topk : int
        Number of words with highest frequencies to keep.
    trim_rule : function, optional
        Custom trimming rule, passed on to :func:`~gensim.utils.prune_vocab`.

    """
    if topk < len(vocab):
        # the topk-th highest count becomes the pruning threshold
        threshold = heapq.nlargest(topk, itervalues(vocab))[-1]
        prune_vocab(vocab, threshold, trim_rule=trim_rule)
|
||
|
||
|
||
def merge_counts(dict1, dict2):
    """Add the frequencies from `dict2` of (word, freq2) onto `dict1` of (word, freq1),
    so that `dict1` holds (word, freq1+freq2). `dict1` is modified in place.

    Parameters
    ----------
    dict1 : dict of (str, int)
        First dictionary.
    dict2 : dict of (str, int)
        Second dictionary.

    Returns
    -------
    result : dict
        Merged dictionary with sum of frequencies as values (the very same object as `dict1`).

    """
    for word, freq in iteritems(dict2):
        if word not in dict1:
            dict1[word] = freq
            continue
        dict1[word] += freq

    return dict1
|
||
|
||
|
||
def qsize(queue):
    """Return the (approximate) size of `queue`, on platforms where it can be determined.

    Parameters
    ----------
    queue : :class:`queue.Queue`
        Input queue.

    Returns
    -------
    int
        Queue size, -1 if `qsize` method isn't implemented (OS X).

    """
    try:
        size = queue.qsize()
    except NotImplementedError:
        # OS X doesn't support qsize
        size = -1
    return size
|
||
|
||
|
||
# Possible return values of a custom `trim_rule`, as interpreted by `keep_vocab_item`:
RULE_DEFAULT = 0  # no opinion, apply the default `count >= min_count` test
RULE_DISCARD = 1  # remove the word
RULE_KEEP = 2  # keep the word


def keep_vocab_item(word, count, min_count, trim_rule=None):
    """Decide whether `word` stays in the vocab or gets removed.

    Parameters
    ----------
    word : str
        Input word.
    count : int
        Number of times that word appeared in a corpus.
    min_count : int
        Discard words with frequency smaller than this.
    trim_rule : function, optional
        Custom function to decide whether to keep or discard this word, called as
        `trim_rule(word, count, min_count)`. Returning `RULE_KEEP` keeps the word, `RULE_DISCARD` removes it,
        anything else falls back to the default.
        If a custom `trim_rule` is not specified, the default behaviour is simply `count >= min_count`.

    Returns
    -------
    bool
        True if `word` should stay, False otherwise.

    """
    fallback = count >= min_count
    if trim_rule is None:
        return fallback

    verdict = trim_rule(word, count, min_count)
    if verdict == RULE_KEEP:
        return True
    if verdict == RULE_DISCARD:
        return False
    return fallback
|
||
|
||
|
||
def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs):
    r"""Run OS command with the given arguments and return its output as a byte string.

    Backported from Python 2.7 with a few minor modifications. Widely used for :mod:`gensim.models.wrappers`.
    Behaves very similar to https://docs.python.org/2/library/subprocess.html#subprocess.check_output.

    Parameters
    ----------
    stdout : optional
        Passed to :class:`subprocess.Popen` as its `stdout` argument. Note that this is the FIRST
        positional parameter, so the command itself should be given by keyword (`args=[...]`).
    *popenargs
        Further positional arguments for :class:`subprocess.Popen`.
    **kwargs
        Keyword arguments for :class:`subprocess.Popen`.

    Returns
    -------
    str
        Whatever the process wrote to its standard output.

    Examples
    --------
    >>> from gensim.utils import check_output
    >>> check_output(args=['echo', '1'])
    '1\n'

    Raises
    ------
    subprocess.CalledProcessError
        If the command finished with a non-zero return code; the captured output is available
        in the `output` attribute of the exception.
    KeyboardInterrupt
        If Ctrl+C pressed.

    """
    logger.debug("COMMAND: %s %s", popenargs, kwargs)
    # Start the process outside of the `try`: if an interrupt arrives before the process exists,
    # there is nothing to terminate, and the handler below must not touch an unbound `process`.
    process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs)
    try:
        output, unused_err = process.communicate()
        retcode = process.poll()
        if retcode:
            cmd = kwargs.get("args")
            if cmd is None:
                cmd = popenargs[0]
            error = subprocess.CalledProcessError(retcode, cmd)
            error.output = output
            raise error
        return output
    except KeyboardInterrupt:
        # don't leave the child process running after the user aborted
        process.terminate()
        raise
|
||
|
||
|
||
def sample_dict(d, n=10, use_random=True):
    """Pick `n` (possibly random) items from the dictionary `d`.

    Parameters
    ----------
    d : dict
        Input dictionary.
    n : int, optional
        Number of items to select.
    use_random : bool, optional
        Select items randomly (without replacement), instead of by the natural dict iteration order?

    Returns
    -------
    list of (object, object)
        Selected items from dictionary, as a list.

    """
    if use_random:
        # never ask for more keys than there are
        keys = random.sample(list(d), min(len(d), n))
    else:
        keys = itertools.islice(iterkeys(d), n)
    return [(key, d[key]) for key in keys]
|
||
|
||
|
||
def strided_windows(ndarray, window_size):
    """Slide a window of `window_size` over the array and return all the windows as one numpy.ndarray.

    Parameters
    ----------
    ndarray : numpy.ndarray
        Input array
    window_size : int
        Sliding window size.

    Returns
    -------
    numpy.ndarray
        Subsequences produced by sliding a window of the given size over the `ndarray`.
        Since this uses striding, the individual arrays are views rather than copies of `ndarray`.
        Changes to one view modifies the others and the original.
        An empty array of shape (0, 0) is returned when the window is larger than the input.

    Examples
    --------
    >>> from gensim.utils import strided_windows
    >>> strided_windows(np.arange(5), 2)
    array([[0, 1],
           [1, 2],
           [2, 3],
           [3, 4]])
    >>> strided_windows(np.arange(10), 5)
    array([[0, 1, 2, 3, 4],
           [1, 2, 3, 4, 5],
           [2, 3, 4, 5, 6],
           [3, 4, 5, 6, 7],
           [4, 5, 6, 7, 8],
           [5, 6, 7, 8, 9]])

    """
    arr = np.asarray(ndarray)
    length = arr.shape[0]
    if window_size > length:
        # the window does not fit even once
        return np.ndarray((0, 0))
    if window_size == length:
        # exactly one window: the input itself
        return np.array([arr])

    step = arr.strides[0]
    window_count = length - window_size + 1
    # moving to the next window and to the next element inside a window both advance by one item
    return np.lib.stride_tricks.as_strided(arr, shape=(window_count, window_size), strides=(step, step))
|
||
|
||
|
||
def iter_windows(texts, window_size, copy=False, ignore_below_size=True, include_doc_num=False):
    """Generate sliding windows of `window_size` over every one of the given texts.

    The windows produced are views of some subsequence of a text.
    To use deep copies instead, pass `copy=True`.

    Parameters
    ----------
    texts : list of str
        List of string sentences.
    window_size : int
        Size of sliding window.
    copy : bool, optional
        Produce deep copies.
    ignore_below_size : bool, optional
        Ignore documents that are not at least `window_size` in length?
    include_doc_num : bool, optional
        Yield the text position with `texts` along with each window?

    Yields
    ------
    window OR (int, window)
        The next window, paired with the position of its document within `texts`
        when `include_doc_num` is set.

    """
    for doc_num, document in enumerate(texts):
        doc_windows = _iter_windows(document, window_size, copy, ignore_below_size)
        for window in doc_windows:
            yield (doc_num, window) if include_doc_num else window
|
||
|
||
|
||
def _iter_windows(document, window_size, copy=False, ignore_below_size=True):
    """Yield the sliding windows of a single document, see :func:`~gensim.utils.iter_windows`.

    When the document yields no window at all, the document itself is yielded instead,
    unless `ignore_below_size` is set.

    """
    windows = strided_windows(document, window_size)
    if windows.shape[0] != 0:
        for window in windows:
            yield window.copy() if copy else window
    elif not ignore_below_size:
        yield document.copy() if copy else document
|
||
|
||
|
||
def flatten(nested_list):
    """Flatten a nested sequence of elements into a single list, recursively.

    Parameters
    ----------
    nested_list : iterable
        Possibly nested sequence of elements to flatten.

    Returns
    -------
    list
        Flattened version of `nested_list`, with the elements in the order
        in which :func:`~gensim.utils.lazy_flatten` produces them.

    """
    flat = []
    flat.extend(lazy_flatten(nested_list))
    return flat
|
||
|
||
|
||
def lazy_flatten(nested_list):
    """Lazy version of :func:`~gensim.utils.flatten`.

    Parameters
    ----------
    nested_list : list
        Possibly nested list.

    Yields
    ------
    object
        The next element that is not an iterable itself. Strings are iterable, but are
        treated as plain elements and yielded whole.

    """
    # The ABCs live in `collections.abc`; the old `collections.Iterable` alias is gone since Python 3.10.
    try:
        from collections.abc import Iterable
    except ImportError:  # Python 2
        from collections import Iterable

    for el in nested_list:
        if isinstance(el, Iterable) and not isinstance(el, string_types):
            # recurse lazily, so that nested levels are not materialised into lists first
            for sub in lazy_flatten(el):
                yield sub
        else:
            yield el
|
||
|
||
|
||
def save_as_line_sentence(corpus, filename):
    """Write the corpus into a file in LineSentence format, i.e. each sentence on a separate line,
    tokens are separated by space.

    Parameters
    ----------
    corpus : iterable of iterables of strings
        Sentences to save, each one given as its tokens.
    filename : str
        Path of the output file, opened through `smart_open`.

    """
    with smart_open(filename, mode='wb', encoding='utf8') as fout:
        for sentence in corpus:
            fout.write(any2unicode(' '.join(sentence) + '\n'))
|
||
|
||
|
||
def effective_n_jobs(n_jobs):
    """Work out how many jobs can actually run in parallel.

    Just like in sklearn, passing n_jobs=-1 means using all available
    CPU cores.

    Parameters
    ----------
    n_jobs : int
        Number of workers requested by caller. `None` stands for a single job; a negative value
        counts back from the number of CPU cores (-1 = all cores, -2 = all but one, ...), never going below 1.

    Returns
    -------
    int
        Number of effective jobs.

    Raises
    ------
    ValueError
        If `n_jobs` is 0.

    """
    if n_jobs is None:
        return 1
    if n_jobs == 0:
        raise ValueError('n_jobs == 0 in Parallel has no meaning')
    if n_jobs < 0:
        return max(cpu_count() + 1 + n_jobs, 1)
    return n_jobs
|