#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Various general utility functions."""

from __future__ import with_statement

from contextlib import contextmanager
import collections
import logging
import warnings

try:
    from html.entities import name2codepoint as n2cp
except ImportError:
    from htmlentitydefs import name2codepoint as n2cp
try:
    import cPickle as _pickle
except ImportError:
    import pickle as _pickle

import re
import unicodedata
import os
import random
import itertools
import tempfile
from functools import wraps
import multiprocessing
import shutil
import sys
import subprocess
import inspect
import heapq

import numpy as np
import numbers
import scipy.sparse

from six import iterkeys, iteritems, itervalues, u, string_types, unichr
from six.moves import xrange

from smart_open import smart_open

from multiprocessing import cpu_count

if sys.version_info[0] >= 3:
    unicode = str

logger = logging.getLogger(__name__)


PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)


def get_random_state(seed):
    """Generate :class:`numpy.random.RandomState` based on input seed.

    Parameters
    ----------
    seed : {None, int, array_like}
        Seed for random state.

    Returns
    -------
    :class:`numpy.random.RandomState`
        Random state.

    Raises
    ------
    ValueError
        If seed is not {None, int, array_like}.

    Notes
    -----
    Method originally from `maciejkula/glove-python <https://github.com/maciejkula/glove-python>`_
    and written by `@joshloyal <https://github.com/joshloyal>`_.

    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, (numbers.Integral, np.integer)):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError('%r cannot be used to seed a np.random.RandomState instance' % seed)
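
# A minimal, illustrative sketch of how `get_random_state` behaves for each accepted
# seed type (doctest-style; the seed value 42 is an arbitrary assumption):
#
#     >>> rng = get_random_state(42)           # int -> fresh RandomState seeded with 42
#     >>> rng is get_random_state(rng)         # an existing RandomState is passed through
#     True
#     >>> get_random_state(None) is np.random.mtrand._rand  # None -> numpy's global state
#     True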


def synchronous(tlockname):
    """A decorator to place an instance-based lock around a method.

    Notes
    -----
    Adapted from http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/.

    """
    def _synched(func):
        @wraps(func)
        def _synchronizer(self, *args, **kwargs):
            tlock = getattr(self, tlockname)
            logger.debug("acquiring lock %r for %s", tlockname, func.__name__)

            with tlock:  # use lock as a context manager to perform safe acquire/release pairs
                logger.debug("acquired lock %r for %s", tlockname, func.__name__)
                result = func(self, *args, **kwargs)
                logger.debug("releasing lock %r for %s", tlockname, func.__name__)
                return result
        return _synchronizer
    return _synched
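
# A hypothetical usage sketch for `synchronous`: the decorated method looks up a lock
# stored on the instance under the given attribute name. The class `Counter` and its
# `_lock` attribute are assumptions for illustration only:
#
#     import threading
#
#     class Counter(object):
#         def __init__(self):
#             self._lock = threading.Lock()
#             self.value = 0
#
#         @synchronous('_lock')
#         def increment(self):  # body runs with self._lock held
#             self.value += 1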


def file_or_filename(input):
    """Open a filename for reading with `smart_open`, or seek to the beginning if `input` is an already open file.

    Parameters
    ----------
    input : str or file-like
        Filename or file-like object.

    Returns
    -------
    file-like object
        An open file, positioned at the beginning.

    """
    if isinstance(input, string_types):
        # input was a filename: open as file
        return smart_open(input)
    else:
        # input already a file-like object; just reset to the beginning
        input.seek(0)
        return input


@contextmanager
def open_file(input):
    """Provide "with-like" behaviour without closing the file object.

    Parameters
    ----------
    input : str or file-like
        Filename or file-like object.

    Yields
    ------
    file
        File-like object based on `input` (or `input` itself, if it is already file-like).

    """
    mgr = file_or_filename(input)
    exc = False
    try:
        yield mgr
    except Exception:
        # Handling any unhandled exceptions from the code nested in 'with' statement.
        exc = True
        if not isinstance(input, string_types) or not mgr.__exit__(*sys.exc_info()):
            raise
        # Try to introspect and silence errors.
    finally:
        if not exc and isinstance(input, string_types):
            mgr.__exit__(None, None, None)
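
# A minimal sketch of `open_file` with both input kinds ('corpus.txt' is a hypothetical
# path). A filename is opened and closed for you; an already-open handle is only rewound,
# never closed:
#
#     with open_file('corpus.txt') as fin:       # str -> opened via smart_open, auto-closed
#         first = fin.readline()
#
#     handle = open('corpus.txt', 'rb')
#     with open_file(handle) as fin:             # file-like -> seek(0), left open afterwards
#         first_again = fin.readline()
#     handle.close()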


def deaccent(text):
    """Remove letter accents from the given string.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        Unicode string without accents.

    Examples
    --------
    >>> from gensim.utils import deaccent
    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    norm = unicodedata.normalize("NFD", text)
    result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result)


def copytree_hardlink(source, dest):
    """Recursively copy a directory a la `shutil.copytree`, but hardlink files instead of copying them.

    Parameters
    ----------
    source : str
        Path to source directory.
    dest : str
        Path to destination directory.

    Warnings
    --------
    Available on UNIX systems only.

    """
    copy2 = shutil.copy2
    try:
        shutil.copy2 = os.link
        shutil.copytree(source, dest)
    finally:
        shutil.copy2 = copy2


def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict", to_lower=False, lower=False):
    """Iteratively yield tokens as unicode strings, optionally removing accent marks and lowercasing them.

    Parameters
    ----------
    text : str or bytes
        Input string.
    deacc : bool, optional
        Remove accentuation using :func:`~gensim.utils.deaccent`?
    encoding : str, optional
        Encoding of input string, used as parameter for :func:`~gensim.utils.to_unicode`.
    errors : str, optional
        Error handling behaviour, used as parameter for :func:`~gensim.utils.to_unicode`.
    lowercase : bool, optional
        Lowercase the input string?
    to_lower : bool, optional
        Same as `lowercase`. Convenience alias.
    lower : bool, optional
        Same as `lowercase`. Convenience alias.

    Yields
    ------
    str
        Contiguous sequences of alphabetic characters (no digits!), using :func:`~gensim.utils.simple_tokenize`.

    Examples
    --------
    >>> from gensim.utils import tokenize
    >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc=True))
    [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']

    """
    lowercase = lowercase or to_lower or lower
    text = to_unicode(text, encoding, errors=errors)
    if lowercase:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    return simple_tokenize(text)


def simple_tokenize(text):
    """Tokenize input text using :const:`gensim.utils.PAT_ALPHABETIC`.

    Parameters
    ----------
    text : str
        Input text.

    Yields
    ------
    str
        Tokens from `text`.

    """
    for match in PAT_ALPHABETIC.finditer(text):
        yield match.group()


def simple_preprocess(doc, deacc=False, min_len=2, max_len=15):
    """Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.

    Uses :func:`~gensim.utils.tokenize` internally.

    Parameters
    ----------
    doc : str
        Input document.
    deacc : bool, optional
        Remove accent marks from tokens using :func:`~gensim.utils.deaccent`?
    min_len : int, optional
        Minimum length of token (inclusive). Shorter tokens are discarded.
    max_len : int, optional
        Maximum length of token in result (inclusive). Longer tokens are discarded.

    Returns
    -------
    list of str
        Tokens extracted from `doc`.

    """
    tokens = [
        token for token in tokenize(doc, lower=True, deacc=deacc, errors='ignore')
        if min_len <= len(token) <= max_len and not token.startswith('_')
    ]
    return tokens
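
# A minimal doctest-style sketch of `simple_preprocess`; the input sentence is an
# arbitrary example. Note the length filter: one-character tokens like "A" are dropped
# by the default min_len=2:
#
#     >>> simple_preprocess("A! Lazy fox jumped over the lazy DOG.")
#     ['lazy', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']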


def any2utf8(text, errors='strict', encoding='utf8'):
    """Convert a unicode or bytes string in the given encoding into a utf8 bytestring.

    Parameters
    ----------
    text : str
        Input text.
    errors : str, optional
        Error handling behaviour if `text` is a bytestring.
    encoding : str, optional
        Encoding of `text` if it is a bytestring.

    Returns
    -------
    str
        Bytestring in utf8.

    """
    if isinstance(text, unicode):
        return text.encode('utf8')
    # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8
    return unicode(text, encoding, errors=errors).encode('utf8')


to_utf8 = any2utf8


def any2unicode(text, encoding='utf8', errors='strict'):
    """Convert `text` (bytestring in given encoding or unicode) to unicode.

    Parameters
    ----------
    text : str
        Input text.
    errors : str, optional
        Error handling behaviour if `text` is a bytestring.
    encoding : str, optional
        Encoding of `text` if it is a bytestring.

    Returns
    -------
    str
        Unicode version of `text`.

    """
    if isinstance(text, unicode):
        return text
    return unicode(text, encoding, errors=errors)


to_unicode = any2unicode


def call_on_class_only(*args, **kwargs):
    """Helper to raise `AttributeError` if a class method is called on an instance. Used internally.

    Parameters
    ----------
    *args
        Variable length argument list.
    **kwargs
        Arbitrary keyword arguments.

    Raises
    ------
    AttributeError
        If a class method is called on an instance.

    """
    raise AttributeError('This method should be called on a class object.')


class SaveLoad(object):
    """Serialize/deserialize objects from disk, by equipping them with the `save()` / `load()` methods.

    Warnings
    --------
    This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes
    such as lambda functions etc.

    """

    @classmethod
    def load(cls, fname, mmap=None):
        """Load an object previously saved using :meth:`~gensim.utils.SaveLoad.save` from a file.

        Parameters
        ----------
        fname : str
            Path to file that contains needed object.
        mmap : str, optional
            Memory-map option. If the object was saved with large arrays stored separately, you can load these
            arrays via mmap (shared memory) using `mmap='r'`.
            If the file being loaded is compressed (either '.gz' or '.bz2'), then `mmap=None` **must be** set.

        See Also
        --------
        :meth:`~gensim.utils.SaveLoad.save`
            Save object to file.

        Returns
        -------
        object
            Object loaded from `fname`.

        Raises
        ------
        AttributeError
            When called on an object instance instead of class (this is a class method).

        """
        logger.info("loading %s object from %s", cls.__name__, fname)

        compress, subname = SaveLoad._adapt_by_suffix(fname)

        obj = unpickle(fname)
        obj._load_specials(fname, mmap, compress, subname)
        logger.info("loaded %s", fname)
        return obj

    def _load_specials(self, fname, mmap, compress, subname):
        """Load attributes that were stored separately, and give them the same opportunity
        to recursively load using the :class:`~gensim.utils.SaveLoad` interface.

        Parameters
        ----------
        fname : str
            Input file path.
        mmap : {None, 'r+', 'r', 'w+', 'c'}
            Memory-map options. See `numpy.load(mmap_mode)
            <https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.load.html>`_.
        compress : bool
            Is the input file compressed?
        subname : function
            Maps attribute names to their storage filenames; produced by
            :meth:`~gensim.utils.SaveLoad._adapt_by_suffix` and passed along automatically
            during recursive processing.

        """
        def mmap_error(obj, filename):
            return IOError(
                'Cannot mmap compressed object %s in file %s. ' % (obj, filename) +
                'Use `load(fname, mmap=None)` or uncompress files manually.'
            )

        for attrib in getattr(self, '__recursive_saveloads', []):
            cfname = '.'.join((fname, attrib))
            logger.info("loading %s recursively from %s.* with mmap=%s", attrib, cfname, mmap)
            getattr(self, attrib)._load_specials(cfname, mmap, compress, subname)

        for attrib in getattr(self, '__numpys', []):
            logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)

            if compress:
                if mmap:
                    raise mmap_error(attrib, subname(fname, attrib))

                val = np.load(subname(fname, attrib))['val']
            else:
                val = np.load(subname(fname, attrib), mmap_mode=mmap)

            setattr(self, attrib, val)

        for attrib in getattr(self, '__scipys', []):
            logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)
            sparse = unpickle(subname(fname, attrib))
            if compress:
                if mmap:
                    raise mmap_error(attrib, subname(fname, attrib))

                with np.load(subname(fname, attrib, 'sparse')) as f:
                    sparse.data = f['data']
                    sparse.indptr = f['indptr']
                    sparse.indices = f['indices']
            else:
                sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap)
                sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap)
                sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap)

            setattr(self, attrib, sparse)

        for attrib in getattr(self, '__ignoreds', []):
            logger.info("setting ignored attribute %s to None", attrib)
            setattr(self, attrib, None)

    @staticmethod
    def _adapt_by_suffix(fname):
        """Get compress setting and filename for numpy file compression.

        Parameters
        ----------
        fname : str
            Input filename.

        Returns
        -------
        (bool, function)
            The first element is True if `fname` is compressed; the second maps attribute names
            to their storage filenames.

        """
        compress, suffix = (True, 'npz') if fname.endswith('.gz') or fname.endswith('.bz2') else (False, 'npy')
        return compress, lambda *args: '.'.join(args + (suffix,))

    def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
        """Save the object to a file. Used internally by :meth:`gensim.utils.SaveLoad.save()`.

        Parameters
        ----------
        fname : str
            Path to file.
        separately : list, optional
            Iterable of attributes that need to be stored separately.
        sep_limit : int, optional
            Limit for separation.
        ignore : frozenset, optional
            Attributes that shouldn't be stored.
        pickle_protocol : int, optional
            Protocol number for pickle.

        Notes
        -----
        If `separately` is None, automatically detect large numpy/scipy.sparse arrays in the object being stored,
        and store them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays back
        on load efficiently.

        You can also set `separately` manually, in which case it must be a list of attribute names to be stored
        in separate files. The automatic check is not performed in this case.

        """
        logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately)

        compress, subname = SaveLoad._adapt_by_suffix(fname)

        restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol,
                                       compress, subname)
        try:
            pickle(self, fname, protocol=pickle_protocol)
        finally:
            # restore attribs handled specially
            for obj, asides in restores:
                for attrib, val in iteritems(asides):
                    setattr(obj, attrib, val)
        logger.info("saved %s", fname)

    def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
        """Save aside any attributes that need to be handled separately, including
        by recursion any attributes that are themselves :class:`~gensim.utils.SaveLoad` instances.

        Parameters
        ----------
        fname : str
            Output filename.
        separately : list or None
            List of attributes to store separately.
        sep_limit : int
            Don't store arrays smaller than this separately. In bytes.
        ignore : iterable of str
            Attributes that shouldn't be stored at all.
        pickle_protocol : int
            Protocol number for pickle.
        compress : bool
            If True - compress output with :func:`numpy.savez_compressed`.
        subname : function
            Produced by :meth:`~gensim.utils.SaveLoad._adapt_by_suffix`.

        Returns
        -------
        list of (obj, {attrib: value, ...})
            Settings that the caller should use to restore each object's attributes that were set aside
            during the default :func:`~gensim.utils.pickle`.

        """
        asides = {}
        sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)
        if separately is None:
            separately = []
            for attrib, val in iteritems(self.__dict__):
                if isinstance(val, np.ndarray) and val.size >= sep_limit:
                    separately.append(attrib)
                elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit:
                    separately.append(attrib)

        # whatever's in `separately` or `ignore` at this point won't get pickled
        for attrib in separately + list(ignore):
            if hasattr(self, attrib):
                asides[attrib] = getattr(self, attrib)
                delattr(self, attrib)

        recursive_saveloads = []
        restores = []
        for attrib, val in iteritems(self.__dict__):
            if hasattr(val, '_save_specials'):  # better than 'isinstance(val, SaveLoad)' if IPython reloading
                recursive_saveloads.append(attrib)
                cfname = '.'.join((fname, attrib))
                restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname))

        try:
            numpys, scipys, ignoreds = [], [], []
            for attrib, val in iteritems(asides):
                if isinstance(val, np.ndarray) and attrib not in ignore:
                    numpys.append(attrib)
                    logger.info("storing np array '%s' to %s", attrib, subname(fname, attrib))

                    if compress:
                        np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val))
                    else:
                        np.save(subname(fname, attrib), np.ascontiguousarray(val))

                elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore:
                    scipys.append(attrib)
                    logger.info("storing scipy.sparse array '%s' under %s", attrib, subname(fname, attrib))

                    if compress:
                        np.savez_compressed(
                            subname(fname, attrib, 'sparse'),
                            data=val.data,
                            indptr=val.indptr,
                            indices=val.indices
                        )
                    else:
                        np.save(subname(fname, attrib, 'data'), val.data)
                        np.save(subname(fname, attrib, 'indptr'), val.indptr)
                        np.save(subname(fname, attrib, 'indices'), val.indices)

                    data, indptr, indices = val.data, val.indptr, val.indices
                    val.data, val.indptr, val.indices = None, None, None

                    try:
                        # store array-less object
                        pickle(val, subname(fname, attrib), protocol=pickle_protocol)
                    finally:
                        val.data, val.indptr, val.indices = data, indptr, indices
                else:
                    logger.info("not storing attribute %s", attrib)
                    ignoreds.append(attrib)

            self.__dict__['__numpys'] = numpys
            self.__dict__['__scipys'] = scipys
            self.__dict__['__ignoreds'] = ignoreds
            self.__dict__['__recursive_saveloads'] = recursive_saveloads
        except Exception:
            # restore the attributes if exception-interrupted
            for attrib, val in iteritems(asides):
                setattr(self, attrib, val)
            raise
        return restores + [(self, asides)]

    def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
        """Save the object to a file.

        Parameters
        ----------
        fname_or_handle : str or file-like
            Path to output file or already opened file-like object. If the object is a file handle,
            no special array handling will be performed, all attributes will be saved to the same file.
        separately : list of str or None, optional
            If None, automatically detect large numpy/scipy.sparse arrays in the object being stored, and store
            them into separate files. This prevents memory errors for large objects, and also allows
            `memory-mapping <https://en.wikipedia.org/wiki/Mmap>`_ the large arrays for efficient
            loading and sharing the large arrays in RAM between multiple processes.

            If list of str: store these attributes into separate files. The automated size check
            is not performed in this case.
        sep_limit : int, optional
            Don't store arrays smaller than this separately. In bytes.
        ignore : frozenset of str, optional
            Attributes that shouldn't be stored at all.
        pickle_protocol : int, optional
            Protocol number for pickle.

        See Also
        --------
        :meth:`~gensim.utils.SaveLoad.load`
            Load object from file.

        """
        try:
            _pickle.dump(self, fname_or_handle, protocol=pickle_protocol)
            logger.info("saved %s object", self.__class__.__name__)
        except TypeError:  # `fname_or_handle` does not have write attribute
            self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol)
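
# A minimal round-trip sketch for `SaveLoad` (the subclass `MyModel` and the path
# '/tmp/mymodel.gz' are hypothetical; any picklable attributes work):
#
#     class MyModel(SaveLoad):
#         def __init__(self, vectors):
#             self.vectors = vectors
#
#     model = MyModel(np.arange(10))
#     model.save('/tmp/mymodel.gz')              # large arrays may be stored separately
#     loaded = MyModel.load('/tmp/mymodel.gz')   # class-level call; returns a MyModel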


def identity(p):
    """Identity function, for flows that don't accept lambda (pickling etc).

    Parameters
    ----------
    p : object
        Input parameter.

    Returns
    -------
    object
        Same as `p`.

    """
    return p


def get_max_id(corpus):
    """Get the highest feature id that appears in the corpus.

    Parameters
    ----------
    corpus : iterable of iterable of (int, numeric)
        Collection of texts in BoW format.

    Returns
    -------
    int
        Highest feature id.

    Notes
    -----
    For an empty `corpus`, return -1.

    """
    maxid = -1
    for document in corpus:
        maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document]))  # [-1] to avoid exceptions from max(empty)
    return maxid


class FakeDict(object):
    """Objects of this class act as dictionaries that map integer->str(integer), for a specified
    range of integers <0, num_terms).

    This is meant to avoid allocating real dictionaries when `num_terms` is huge, which is a waste of memory.

    """
    def __init__(self, num_terms):
        """

        Parameters
        ----------
        num_terms : int
            Number of terms.

        """
        self.num_terms = num_terms

    def __str__(self):
        return "FakeDict(num_terms=%s)" % self.num_terms

    def __getitem__(self, val):
        if 0 <= val < self.num_terms:
            return str(val)
        raise ValueError("internal id out of bounds (%s, expected <0..%s))" % (val, self.num_terms))

    def iteritems(self):
        """Iterate over all keys and values.

        Yields
        ------
        (int, str)
            Pair of (id, token).

        """
        for i in xrange(self.num_terms):
            yield i, str(i)

    def keys(self):
        """Override the `dict.keys()`, which is used to determine the maximum internal id of a corpus,
        i.e. the vocabulary dimensionality.

        Returns
        -------
        list of int
            Highest id, packed in a list.

        Notes
        -----
        To avoid materializing the whole `range(0, self.num_terms)`,
        this returns the highest id = `[self.num_terms - 1]` only.

        """
        return [self.num_terms - 1]

    def __len__(self):
        return self.num_terms

    def get(self, val, default=None):
        if 0 <= val < self.num_terms:
            return str(val)
        return default
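
# A doctest-style sketch of `FakeDict` behaviour (illustrative only):
#
#     >>> d = FakeDict(3)
#     >>> d[2], d.get(5, 'missing'), len(d), d.keys()
#     ('2', 'missing', 3, [2])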


def dict_from_corpus(corpus):
    """Scan corpus for all word ids that appear in it, then construct a mapping
    which maps each `word_id` -> `str(word_id)`.

    Parameters
    ----------
    corpus : iterable of iterable of (int, numeric)
        Collection of texts in BoW format.

    Returns
    -------
    id2word : :class:`~gensim.utils.FakeDict`
        "Fake" mapping which maps each `word_id` -> `str(word_id)`.

    Warnings
    --------
    This function is used whenever *words* need to be displayed (as opposed to just their ids)
    but no `word_id` -> `word` mapping was provided. The resulting mapping only covers words actually
    used in the corpus, up to the highest `word_id` found.

    """
    num_terms = 1 + get_max_id(corpus)
    id2word = FakeDict(num_terms)
    return id2word


def is_corpus(obj):
    """Check whether `obj` is a corpus, by peeking at its first element. Works even on streamed generators.
    The peeked element is put back into an object returned by this function, so always use
    that returned object instead of the original `obj`.

    Parameters
    ----------
    obj : object
        An `iterable of iterable` that contains (int, numeric).

    Returns
    -------
    (bool, object)
        Pair of (is `obj` a corpus, `obj` with peeked element restored).

    Examples
    --------
    >>> from gensim.utils import is_corpus
    >>> corpus = [[(1, 1.0)], [(2, -0.3), (3, 0.12)]]
    >>> corpus_or_not, corpus = is_corpus(corpus)

    Warnings
    --------
    An "empty" corpus (empty input sequence) is ambiguous, so in this case
    the result is forcefully defined as (False, `obj`).

    """
    try:
        if 'Corpus' in obj.__class__.__name__:  # the most common case, quick hack
            return True, obj
    except Exception:
        pass
    try:
        if hasattr(obj, 'next') or hasattr(obj, '__next__'):
            # the input is an iterator object, meaning once we call next()
            # that element could be gone forever. we must be careful to put
            # whatever we retrieve back again
            doc1 = next(obj)
            obj = itertools.chain([doc1], obj)
        else:
            doc1 = next(iter(obj))  # empty corpus is resolved to False here
            if len(doc1) == 0:  # sparse documents must have a __len__ function (list, tuple...)
                return True, obj  # the first document is empty=>assume this is a corpus

        # if obj is a 1D numpy array(scalars) instead of 2-tuples, it resolves to False here
        id1, val1 = next(iter(doc1))
        id1, val1 = int(id1), float(val1)  # must be a 2-tuple (integer, float)
    except Exception:
        return False, obj
    return True, obj


def get_my_ip():
    """Try to obtain our external ip (from the Pyro4 nameserver's point of view).

    Returns
    -------
    str
        IP address.

    Warnings
    --------
    This tries to sidestep the issue of bogus `/etc/hosts` entries and other local misconfiguration,
    which often mess up hostname resolution.
    If all else fails, fall back to simple `socket.gethostbyname()` lookup.

    """
    import socket
    try:
        from Pyro4.naming import locateNS
        # we know the nameserver must exist, so use it as our anchor point
        ns = locateNS()
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect((ns._pyroUri.host, ns._pyroUri.port))
        result, port = s.getsockname()
    except Exception:
        try:
            # see what ifconfig says about our default interface
            import commands
            result = commands.getoutput("ifconfig").split("\n")[1].split()[1][5:]
            if len(result.split('.')) != 4:
                raise Exception()
        except Exception:
            # give up, leave the resolution to gethostbyname
            result = socket.gethostbyname(socket.gethostname())
    return result


class RepeatCorpus(SaveLoad):
    """Wrap a `corpus` as another corpus of length `reps`. This is achieved by repeating documents from `corpus`
    over and over again, until the requested length `len(result) == reps` is reached.
    Repetition is done on the fly, efficiently, via `itertools`.

    Examples
    --------
    >>> from gensim.utils import RepeatCorpus
    >>>
    >>> corpus = [[(1, 2)], []]  # 2 documents
    >>> list(RepeatCorpus(corpus, 5))  # repeat 2.5 times to get 5 documents
    [[(1, 2)], [], [(1, 2)], [], [(1, 2)]]

    """
    def __init__(self, corpus, reps):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        reps : int
            Number of repeats for documents from corpus.

        """
        self.corpus = corpus
        self.reps = reps

    def __iter__(self):
        return itertools.islice(itertools.cycle(self.corpus), self.reps)


class RepeatCorpusNTimes(SaveLoad):
    """Wrap a `corpus` and repeat it `n` times.

    Examples
    --------
    >>> from gensim.utils import RepeatCorpusNTimes
    >>>
    >>> corpus = [[(1, 0.5)], []]
    >>> list(RepeatCorpusNTimes(corpus, 3))  # repeat 3 times
    [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []]

    """
    def __init__(self, corpus, n):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        n : int
            Number of repeats for corpus.

        """
        self.corpus = corpus
        self.n = n

    def __iter__(self):
        for _ in xrange(self.n):
            for document in self.corpus:
                yield document


class ClippedCorpus(SaveLoad):
    """Wrap a `corpus` and return at most the first `max_docs` documents from it."""
    def __init__(self, corpus, max_docs=None):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        max_docs : int
            Maximum number of documents in the wrapped corpus.

        Warnings
        --------
        Any documents after `max_docs` are ignored. This effectively limits the length of the returned corpus
        to <= `max_docs`. Set `max_docs=None` for "no limit", effectively wrapping the entire input corpus.

        """
        self.corpus = corpus
        self.max_docs = max_docs

    def __iter__(self):
        return itertools.islice(self.corpus, self.max_docs)

    def __len__(self):
        if self.max_docs is None:
            # no clipping: the length is that of the underlying corpus
            return len(self.corpus)
        return min(self.max_docs, len(self.corpus))
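
# A doctest-style sketch of `ClippedCorpus` (the corpus is an arbitrary example):
#
#     >>> corpus = [[(1, 0.5)], [], [(2, 1.0)]]
#     >>> list(ClippedCorpus(corpus, max_docs=2))
#     [[(1, 0.5)], []]
#     >>> len(ClippedCorpus(corpus, max_docs=2))
#     2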


class SlicedCorpus(SaveLoad):
    """Wrap `corpus` and return a slice of it."""
    def __init__(self, corpus, slice_):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        slice_ : slice or iterable
            Slice for `corpus`.

        Notes
        -----
        Negative slicing can only be used if the corpus is indexable; otherwise, the corpus will be iterated over.
        Slice can also be a np.ndarray, to support fancy indexing.

        Calculating the size of a SlicedCorpus is expensive when using a slice, as the corpus has
        to be iterated over once. Using a list or np.ndarray does not have this drawback, but consumes more memory.

        """
        self.corpus = corpus
        self.slice_ = slice_
        self.length = None

    def __iter__(self):
        if hasattr(self.corpus, 'index') and len(self.corpus.index) > 0:
            return (self.corpus.docbyoffset(i) for i in self.corpus.index[self.slice_])
        return itertools.islice(self.corpus, self.slice_.start, self.slice_.stop, self.slice_.step)

    def __len__(self):
        # check cached length, calculate if needed
        if self.length is None:
            if isinstance(self.slice_, (list, np.ndarray)):
                self.length = len(self.slice_)
            elif isinstance(self.slice_, slice):
                (start, end, step) = self.slice_.indices(len(self.corpus.index))
                diff = end - start
                self.length = diff // step + (diff % step > 0)
            else:
                self.length = sum(1 for x in self)

        return self.length
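
# A minimal sketch of `SlicedCorpus` on a non-indexable streamed corpus; with a `slice`
# the documents are produced via itertools.islice. A plain list is avoided here on
# purpose, since `hasattr(corpus, 'index')` above would match the built-in `list.index`
# method rather than an index attribute:
#
#     >>> docs = iter([[(1, 0.5)], [], [(2, 1.0)], [(3, 0.1)]])
#     >>> list(SlicedCorpus(docs, slice(1, 3)))
#     [[], [(2, 1.0)]]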


def safe_unichr(intval):
    """Create a unicode character from its integer value. In case `unichr` fails, render the character
    as an escaped `\\U<8-byte hex value of intval>` string.

    Parameters
    ----------
    intval : int
        Integer code of character.

    Returns
    -------
    string
        Unicode string of character.

    """
    try:
        return unichr(intval)
    except ValueError:
        # ValueError: unichr() arg not in range(0x10000) (narrow Python 2 builds only)
        s = "\\U%08x" % intval
        # return UTF16 surrogate pair
        return s.decode('unicode-escape')


def decode_htmlentities(text):
    """Decode all HTML entities in text that are encoded as hex, decimal or named entities.
    Adapted from `python-twitter-ircbot/html_decode.py
    <http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py>`_.

    Parameters
    ----------
    text : str
        Input HTML.

    Examples
    --------
    >>> from gensim.utils import decode_htmlentities
    >>>
    >>> u = u'E tu vivrai nel terrore - L&#x27;aldil&#xE0; (1981)'
    >>> print(decode_htmlentities(u).encode('UTF-8'))
    E tu vivrai nel terrore - L'aldilà (1981)
    >>> print(decode_htmlentities("l&#39;eau"))
    l'eau
    >>> print(decode_htmlentities("foo &lt; bar"))
    foo < bar

    """
    def substitute_entity(match):
        try:
            ent = match.group(3)
            if match.group(1) == "#":
                # decoding by number
                if match.group(2) == '':
                    # number is in decimal
                    return safe_unichr(int(ent))
                elif match.group(2) in ['x', 'X']:
                    # number is in hex
                    return safe_unichr(int(ent, 16))
            else:
                # they were using a name
                cp = n2cp.get(ent)
                if cp:
                    return safe_unichr(cp)
                else:
                    return match.group()
        except Exception:
            # in case of errors, return original input
            return match.group()

    return RE_HTML_ENTITY.sub(substitute_entity, text)


def chunkize_serial(iterable, chunksize, as_numpy=False, dtype=np.float32):
    """Yield elements from `iterable` in "chunksize"-ed groups.

    The last returned element may be smaller if the length of collection is not divisible by `chunksize`.

    Parameters
    ----------
    iterable : iterable of object
        An iterable.
    chunksize : int
        Split iterable into chunks of this size.
    as_numpy : bool, optional
        Yield chunks as `np.ndarray` instead of lists.

    Yields
    ------
    list OR np.ndarray
        "chunksize"-ed chunks of elements from `iterable`.

    Examples
    --------
    >>> print(list(grouper(range(10), 3)))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

    """
    it = iter(iterable)
    while True:
        if as_numpy:
            # convert each document to a 2d numpy array (~6x faster when transmitting
            # chunk data over the wire, in Pyro)
            wrapped_chunk = [[np.array(doc, dtype=dtype) for doc in itertools.islice(it, int(chunksize))]]
        else:
            wrapped_chunk = [list(itertools.islice(it, int(chunksize)))]
        if not wrapped_chunk[0]:
            break
        # memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference
        yield wrapped_chunk.pop()


grouper = chunkize_serial


class InputQueue(multiprocessing.Process):
    """Populate a queue of input chunks from a streamed corpus.

    Useful for reading and chunking corpora in the background, in a separate process,
    so that workers that use the queue are not starved for input chunks.

    """
    def __init__(self, q, corpus, chunksize, maxsize, as_numpy):
        """

        Parameters
        ----------
        q : multiprocessing.Queue
            Enqueue chunks into this queue.
        corpus : iterable of iterable of (int, numeric)
            Corpus to read and split into "chunksize"-ed groups.
        chunksize : int
            Split `corpus` into chunks of this size.
        maxsize : int
            Maximum number of prepared chunks to keep in the queue.
        as_numpy : bool, optional
            Enqueue chunks as `numpy.ndarray` instead of lists.

        """
        super(InputQueue, self).__init__()
        self.q = q
        self.maxsize = maxsize
        self.corpus = corpus
        self.chunksize = chunksize
        self.as_numpy = as_numpy

    def run(self):
        it = iter(self.corpus)
        while True:
            chunk = itertools.islice(it, self.chunksize)
            if self.as_numpy:
                # HACK XXX convert documents to numpy arrays, to save memory.
                # This also gives a scipy warning at runtime:
                # "UserWarning: indices array has non-integer dtype (float64)"
                wrapped_chunk = [[np.asarray(doc) for doc in chunk]]
            else:
                wrapped_chunk = [list(chunk)]

            if not wrapped_chunk[0]:
                self.q.put(None, block=True)
                break

            try:
                qsize = self.q.qsize()
            except NotImplementedError:
                qsize = '?'
            logger.debug("prepared another chunk of %i documents (qsize=%s)", len(wrapped_chunk[0]), qsize)
            self.q.put(wrapped_chunk.pop(), block=True)


if os.name == 'nt':
    warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

    def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
        """Split `corpus` into fixed-sized chunks, using :func:`~gensim.utils.chunkize_serial`.

        Parameters
        ----------
        corpus : iterable of object
            An iterable.
        chunksize : int
            Split `corpus` into chunks of this size.
        maxsize : int, optional
            Ignored. For interface compatibility only.
        as_numpy : bool, optional
            Yield chunks as `np.ndarray`s instead of lists?

        Yields
        ------
        list OR np.ndarray
            "chunksize"-ed chunks of elements from `corpus`.

        """
        for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):
            yield chunk
else:
    def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
        """Split `corpus` into fixed-sized chunks, using :func:`~gensim.utils.chunkize_serial`.

        Parameters
        ----------
        corpus : iterable of object
            An iterable.
        chunksize : int
            Split `corpus` into chunks of this size.
        maxsize : int, optional
            If > 0, prepare chunks in a background process, filling a chunk queue of size at most `maxsize`.
        as_numpy : bool, optional
            Yield chunks as `np.ndarray` instead of lists?

        Yields
        ------
        list OR np.ndarray
            "chunksize"-ed chunks of elements from `corpus`.

        Notes
        -----
        Each chunk is of length `chunksize`, except the last one which may be smaller.
        A once-only input stream (`corpus` from a generator) is ok, chunking is done efficiently via itertools.

        If `maxsize > 0`, don't wait idly in between successive chunk `yields`, but rather keep filling a short
        queue (of size at most `maxsize`) with forthcoming chunks in advance. This is realized by starting a
        separate process, and is meant to reduce I/O delays, which can be significant when `corpus` comes from a
        slow medium like HDD, database or network.

        If `maxsize == 0`, don't fool around with parallelism and simply yield the chunks
        via :func:`~gensim.utils.chunkize_serial` (no I/O optimizations).

        """
        assert chunksize > 0

        if maxsize > 0:
            q = multiprocessing.Queue(maxsize=maxsize)
            worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy)
            worker.daemon = True
            worker.start()
            while True:
                chunk = [q.get(block=True)]
                if chunk[0] is None:
                    break
                yield chunk.pop()
        else:
            for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):
                yield chunk


def smart_extension(fname, ext):
    """Append a file extension `ext` to `fname`, while keeping compressed extensions like `.bz2` or
    `.gz` (if any) at the end.

    Parameters
    ----------
    fname : str
        Filename or full path.
    ext : str
        Extension to append before any compression extensions.

    Returns
    -------
    str
        New path to file with `ext` appended.

    Examples
    --------
    >>> from gensim.utils import smart_extension
    >>> smart_extension("my_file.pkl.gz", ".vectors")
    'my_file.pkl.vectors.gz'

    """
    fname, oext = os.path.splitext(fname)
    if oext.endswith('.bz2'):
        fname = fname + oext[:-4] + ext + '.bz2'
    elif oext.endswith('.gz'):
        fname = fname + oext[:-3] + ext + '.gz'
    else:
        fname = fname + oext + ext

    return fname


def pickle(obj, fname, protocol=2):
    """Pickle object `obj` to file `fname`, using smart_open so that `fname` can be on S3, HDFS, compressed etc.

    Parameters
    ----------
    obj : object
        Any python object.
    fname : str
        Path to pickle file.
    protocol : int, optional
        Pickle protocol number. Default is 2 in order to support compatibility across python 2.x and 3.x.

    """
    with smart_open(fname, 'wb') as fout:  # 'b' for binary, needed on Windows
        _pickle.dump(obj, fout, protocol=protocol)


def unpickle(fname):
    """Load object from `fname`, using smart_open so that `fname` can be on S3, HDFS, compressed etc.

    Parameters
    ----------
    fname : str
        Path to pickle file.

    Returns
    -------
    object
        Python object loaded from `fname`.

    """
    with smart_open(fname, 'rb') as f:
        # Because of loading from S3 load can't be used (missing readline in smart_open)
        if sys.version_info > (3, 0):
            return _pickle.load(f, encoding='latin1')
        else:
            return _pickle.loads(f.read())
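
# A minimal round-trip sketch for `pickle` / `unpickle`; '/tmp/vocab.pkl.gz' is a
# hypothetical path (the .gz suffix makes smart_open compress transparently):
#
#     vocab = {'word': 42}
#     pickle(vocab, '/tmp/vocab.pkl.gz')
#     assert unpickle('/tmp/vocab.pkl.gz') == vocab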


def revdict(d):
    """Reverse a dictionary mapping, i.e. `{1: 2, 3: 4}` -> `{2: 1, 4: 3}`.

    Parameters
    ----------
    d : dict
        Input dictionary.

    Returns
    -------
    dict
        Reversed dictionary mapping.

    Notes
    -----
    When two keys map to the same value, only one of them will be kept in the result (which one is kept is arbitrary).

    Examples
    --------
    >>> from gensim.utils import revdict
    >>> d = {1: 2, 3: 4}
    >>> revdict(d)
    {2: 1, 4: 3}

    """
    return {v: k for (k, v) in iteritems(dict(d))}


def deprecated(reason):
    """Decorator to mark functions as deprecated.

    Calling a decorated function will result in a warning being emitted, using warnings.warn.
    Adapted from https://stackoverflow.com/a/40301488/8001386.

    Parameters
    ----------
    reason : str
        Reason of deprecation.

    Returns
    -------
    function
        Decorated function.

    """
    if isinstance(reason, string_types):
        def decorator(func):
            fmt = "Call to deprecated `{name}` ({reason})."

            @wraps(func)
            def new_func1(*args, **kwargs):
                warnings.warn(
                    fmt.format(name=func.__name__, reason=reason),
                    category=DeprecationWarning,
                    stacklevel=2
                )
                return func(*args, **kwargs)

            return new_func1
        return decorator

    elif inspect.isclass(reason) or inspect.isfunction(reason):
        func = reason
        fmt = "Call to deprecated `{name}`."

        @wraps(func)
        def new_func2(*args, **kwargs):
            warnings.warn(
                fmt.format(name=func.__name__),
                category=DeprecationWarning,
                stacklevel=2
            )
            return func(*args, **kwargs)
        return new_func2

    else:
        raise TypeError(repr(type(reason)))
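
# `deprecated` supports two call styles, as the dispatch on `reason` above shows
# (the function names here are hypothetical):
#
#     @deprecated("use shiny_new_fnc instead")   # with a reason string
#     def old_fnc():
#         pass
#
#     @deprecated                                 # bare, without arguments
#     def older_fnc():
#         pass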
@deprecated("Function will be removed in 4.0.0")
|
|
def toptexts(query, texts, index, n=10):
|
|
"""Debug fnc to help inspect the top `n` most similar documents (according to a similarity index `index`),
|
|
to see if they are actually related to the query.
|
|
|
|
Parameters
|
|
----------
|
|
query : {list of (int, number), numpy.ndarray}
|
|
vector OR BoW (list of tuples)
|
|
texts : str
|
|
object that can return something insightful for each document via `texts[docid]`,
|
|
such as its fulltext or snippet.
|
|
index : any
|
|
A instance from from :mod:`gensim.similarity.docsim`.
|
|
|
|
Return
|
|
------
|
|
list
|
|
a list of 3-tuples (docid, doc's similarity to the query, texts[docid])
|
|
|
|
"""
|
|
sims = index[query] # perform a similarity query against the corpus
|
|
sims = sorted(enumerate(sims), key=lambda item: -item[1])
|
|
|
|
return [(topid, topcosine, texts[topid]) for topid, topcosine in sims[:n]] # only consider top-n most similar docs
|
|
|
|
|
|


def randfname(prefix='gensim'):
    """Generate a random filename in temp.

    Parameters
    ----------
    prefix : str
        Prefix of filename.

    Returns
    -------
    str
        Full path in the system's temporary folder, ending in a random filename.

    """
    randpart = hex(random.randint(0, 0xffffff))[2:]
    return os.path.join(tempfile.gettempdir(), prefix + randpart)
@deprecated("Function will be removed in 4.0.0")
|
|
def upload_chunked(server, docs, chunksize=1000, preprocess=None):
|
|
"""Memory-friendly upload of documents to a SimServer (or Pyro SimServer proxy).
|
|
|
|
Notes
|
|
-----
|
|
Use this function to train or index large collections -- avoid sending the
|
|
entire corpus over the wire as a single Pyro in-memory object. The documents
|
|
will be sent in smaller chunks, of `chunksize` documents each.
|
|
|
|
"""
|
|
start = 0
|
|
for chunk in grouper(docs, chunksize):
|
|
end = start + len(chunk)
|
|
logger.info("uploading documents %i-%i", start, end - 1)
|
|
if preprocess is not None:
|
|
pchunk = []
|
|
for doc in chunk:
|
|
doc['tokens'] = preprocess(doc['text'])
|
|
del doc['text']
|
|
pchunk.append(doc)
|
|
chunk = pchunk
|
|
server.buffer(chunk)
|
|
start = end
|
|
|
|
|
|


def getNS(host=None, port=None, broadcast=True, hmac_key=None):
    """Get a Pyro4 name server proxy.

    Parameters
    ----------
    host : str, optional
        Name server hostname.
    port : int, optional
        Name server port.
    broadcast : bool, optional
        Use broadcast mechanism? (i.e. reach out to all Pyro nodes in the network)
    hmac_key : str, optional
        Private key.

    Raises
    ------
    RuntimeError
        When Pyro name server is not found.

    Returns
    -------
    :class:`Pyro4.core.Proxy`
        Proxy from Pyro4.

    """
    import Pyro4
    try:
        return Pyro4.locateNS(host, port, broadcast, hmac_key)
    except Pyro4.errors.NamingError:
        raise RuntimeError("Pyro name server not found")


def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf=None):
    """Register an object with the Pyro name server.

    Start the name server if not running yet and block until the daemon is terminated.
    The object is registered under `name`, or `name` + some random suffix if `random_suffix` is set.

    """
    if ns_conf is None:
        ns_conf = {}
    if random_suffix:
        name += '.' + hex(random.randint(0, 0xffffff))[2:]

    import Pyro4
    with getNS(**ns_conf) as ns:
        with Pyro4.Daemon(ip or get_my_ip(), port or 0) as daemon:
            # register server for remote access
            uri = daemon.register(obj, name)
            ns.remove(name)
            ns.register(name, uri)
            logger.info("%s registered with nameserver (URI '%s')", name, uri)
            daemon.requestLoop()


def has_pattern():
    """Check whether the `pattern <https://github.com/clips/pattern>`_ package is installed.

    Returns
    -------
    bool
        Is `pattern` installed?

    """
    try:
        from pattern.en import parse  # noqa:F401
        return True
    except ImportError:
        return False


def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
              stopwords=frozenset(), min_length=2, max_length=15):
    """Use the English lemmatizer from `pattern <https://github.com/clips/pattern>`_ to extract UTF8-encoded tokens in
    their base form aka lemma, e.g. "are, is, being" becomes "be" etc.

    This is a smarter version of stemming, taking word context into account.

    Parameters
    ----------
    content : str
        Input string.
    allowed_tags : :class:`_sre.SRE_Pattern`, optional
        Compiled regexp to select POS that will be used.
        Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
    light : bool, optional
        DEPRECATED FLAG, no longer supported by `pattern`.
    stopwords : frozenset, optional
        Set of words that will be removed from output.
    min_length : int, optional
        Minimal token length in output (inclusive).
    max_length : int, optional
        Maximal token length in output (inclusive).

    Returns
    -------
    list of str
        List of tokens with POS tags.

    Warnings
    --------
    This function is only available when the optional `pattern <https://github.com/clips/pattern>`_ is installed.

    Raises
    ------
    ImportError
        If `pattern <https://github.com/clips/pattern>`_ not installed.

    Examples
    --------
    >>> from gensim.utils import lemmatize
    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

    Note the context-dependent part-of-speech tags between these two examples:

    >>> lemmatize('The study ranks high.')
    ['study/NN', 'rank/VB', 'high/JJ']

    >>> lemmatize('The ranks study hard.')
    ['rank/NN', 'study/VB', 'hard/RB']

    """
    if not has_pattern():
        raise ImportError(
            "Pattern library is not installed. Pattern library is needed in order to use lemmatize function"
        )
    from pattern.en import parse

    if light:
        import warnings
        warnings.warn("The light flag is no longer supported by pattern.")

    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
    # FIXME this throws away all fancy parsing cues, including sentence structure,
    # abbreviations etc.
    content = u(' ').join(tokenize(content, lower=True, errors='ignore'))

    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
                if allowed_tags.match(tag):
                    lemma += "/" + tag[:2]
                    result.append(lemma.encode('utf8'))
    return result


def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):
    """Create a random gensim BoW vector, with the feature counts following the Poisson distribution.

    Parameters
    ----------
    dim : int, optional
        Dimension of vector.
    prob_nnz : float, optional
        Probability that each coordinate will be nonzero; nonzero values are drawn from the Poisson distribution.
    lam : float, optional
        Lambda parameter for the Poisson distribution.

    Returns
    -------
    list of (int, float)
        Vector in BoW format.

    """
    nnz = np.random.uniform(size=(dim,))
    return [(i, float(np.random.poisson(lam=lam) + 1.0)) for i in xrange(dim) if nnz[i] < prob_nnz]


def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0):
    """Create a random Gensim-style corpus (BoW), using :func:`~gensim.utils.mock_data_row`.

    Parameters
    ----------
    n_items : int
        Size of corpus.
    dim : int
        Dimension of vector, used for :func:`~gensim.utils.mock_data_row`.
    prob_nnz : float, optional
        Probability that each coordinate will be nonzero; nonzero values are drawn from the Poisson distribution,
        used for :func:`~gensim.utils.mock_data_row`.
    lam : float, optional
        Parameter for the Poisson distribution, used for :func:`~gensim.utils.mock_data_row`.

    Returns
    -------
    list of list of (int, float)
        Gensim-style corpus.

    """
    return [mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam) for _ in xrange(n_items)]


def prune_vocab(vocab, min_reduce, trim_rule=None):
    """Remove all entries from the `vocab` dictionary with count smaller than `min_reduce`.

    Modifies `vocab` in place, returns the sum of all counts that were pruned.

    Parameters
    ----------
    vocab : dict
        Input dictionary.
    min_reduce : int
        Frequency threshold for tokens in `vocab`.
    trim_rule : function, optional
        Function for trimming entities from vocab, default behaviour is `vocab[w] <= min_reduce`.

    Returns
    -------
    result : int
        Sum of all counts that were pruned.

    """
    result = 0
    old_len = len(vocab)
    for w in list(vocab):  # make a copy of dict's keys
        if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule):  # vocab[w] <= min_reduce:
            result += vocab[w]
            del vocab[w]
    logger.info(
        "pruned out %i tokens with count <=%i (before %i, after %i)",
        old_len - len(vocab), min_reduce, old_len, len(vocab)
    )
    return result


def trim_vocab_by_freq(vocab, topk, trim_rule=None):
    """Retain the `topk` most frequent words in `vocab`.
    If there are more words with the same frequency as the `topk`-th one, they will be kept.
    Modifies `vocab` in place, returns nothing.

    Parameters
    ----------
    vocab : dict
        Input dictionary.
    topk : int
        Number of words with the highest frequencies to keep.
    trim_rule : function, optional
        Function for trimming entities from vocab, default behaviour is `vocab[w] <= min_count`.

    """
    if topk >= len(vocab):
        return

    min_count = heapq.nlargest(topk, itervalues(vocab))[-1]
    prune_vocab(vocab, min_count, trim_rule=trim_rule)
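
# A doctest-style sketch of `trim_vocab_by_freq`; note that ties at the cut-off
# frequency are kept (the vocab counts are arbitrary example values):
#
#     >>> vocab = {'a': 5, 'b': 3, 'c': 3, 'd': 1}
#     >>> trim_vocab_by_freq(vocab, topk=2)   # 'c' ties with 'b' at count 3, so it survives
#     >>> sorted(vocab)
#     ['a', 'b', 'c']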


def merge_counts(dict1, dict2):
    """Merge `dict1` of (word, freq1) and `dict2` of (word, freq2) into `dict1` of (word, freq1 + freq2).

    Parameters
    ----------
    dict1 : dict of (str, int)
        First dictionary.
    dict2 : dict of (str, int)
        Second dictionary.

    Returns
    -------
    result : dict
        Merged dictionary with sum of frequencies as values.

    """
    for word, freq in iteritems(dict2):
        if word in dict1:
            dict1[word] += freq
        else:
            dict1[word] = freq

    return dict1


def qsize(queue):
    """Get the (approximate) queue size where available.

    Parameters
    ----------
    queue : :class:`queue.Queue`
        Input queue.

    Returns
    -------
    int
        Queue size, -1 if `qsize` method isn't implemented (OS X).

    """
    try:
        return queue.qsize()
    except NotImplementedError:
        # OS X doesn't support qsize
        return -1


RULE_DEFAULT = 0
RULE_DISCARD = 1
RULE_KEEP = 2


def keep_vocab_item(word, count, min_count, trim_rule=None):
    """Should we keep `word` in the vocab or remove it?

    Parameters
    ----------
    word : str
        Input word.
    count : int
        Number of times that word appeared in a corpus.
    min_count : int
        Discard words with frequency smaller than this.
    trim_rule : function, optional
        Custom function to decide whether to keep or discard this word.
        If a custom `trim_rule` is not specified, the default behaviour is simply `count >= min_count`.

    Returns
    -------
    bool
        True if `word` should stay, False otherwise.

    """
    default_res = count >= min_count

    if trim_rule is None:
        return default_res
    else:
        rule_res = trim_rule(word, count, min_count)
        if rule_res == RULE_KEEP:
            return True
        elif rule_res == RULE_DISCARD:
            return False
        else:
            return default_res
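
# A minimal sketch of a custom `trim_rule`: a rule returns RULE_KEEP, RULE_DISCARD,
# or RULE_DEFAULT (fall back to the `count >= min_count` check). The stopword set
# here is a hypothetical example:
#
#     def no_stopwords_rule(word, count, min_count):
#         if word in ('the', 'a', 'an'):
#             return RULE_DISCARD         # always drop, however frequent
#         return RULE_DEFAULT             # otherwise defer to min_count
#
#     keep_vocab_item('the', count=1000, min_count=5, trim_rule=no_stopwords_rule)  # -> False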


def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs):
    r"""Run OS command with the given arguments and return its output as a byte string.

    Backported from Python 2.7 with a few minor modifications. Widely used for :mod:`gensim.models.wrappers`.
    Behaves very similarly to https://docs.python.org/2/library/subprocess.html#subprocess.check_output.

    Examples
    --------
    >>> from gensim.utils import check_output
    >>> check_output(args=['echo', '1'])
    '1\n'

    Raises
    ------
    KeyboardInterrupt
        If Ctrl+C pressed.

    """
    try:
        logger.debug("COMMAND: %s %s", popenargs, kwargs)
        process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs)
        output, unused_err = process.communicate()
        retcode = process.poll()
        if retcode:
            cmd = kwargs.get("args")
            if cmd is None:
                cmd = popenargs[0]
            error = subprocess.CalledProcessError(retcode, cmd)
            error.output = output
            raise error
        return output
    except KeyboardInterrupt:
        process.terminate()
        raise


def sample_dict(d, n=10, use_random=True):
    """Select `n` (possibly random) items from the dictionary `d`.

    Parameters
    ----------
    d : dict
        Input dictionary.
    n : int, optional
        Number of items to select.
    use_random : bool, optional
        Select items randomly (without replacement), instead of by the natural dict iteration order?

    Returns
    -------
    list of (object, object)
        Selected items from dictionary, as a list.

    """
    selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n)
    return [(key, d[key]) for key in selected_keys]


def strided_windows(ndarray, window_size):
    """Produce a numpy.ndarray of windows, as from a sliding window.

    Parameters
    ----------
    ndarray : numpy.ndarray
        Input array.
    window_size : int
        Sliding window size.

    Returns
    -------
    numpy.ndarray
        Subsequences produced by sliding a window of the given size over the `ndarray`.
        Since this uses striding, the individual arrays are views rather than copies of `ndarray`.
        Changes to one view modify the others and the original.

    Examples
    --------
    >>> from gensim.utils import strided_windows
    >>> strided_windows(np.arange(5), 2)
    array([[0, 1],
           [1, 2],
           [2, 3],
           [3, 4]])
    >>> strided_windows(np.arange(10), 5)
    array([[0, 1, 2, 3, 4],
           [1, 2, 3, 4, 5],
           [2, 3, 4, 5, 6],
           [3, 4, 5, 6, 7],
           [4, 5, 6, 7, 8],
           [5, 6, 7, 8, 9]])

    """
    ndarray = np.asarray(ndarray)
    if window_size == ndarray.shape[0]:
        return np.array([ndarray])
    elif window_size > ndarray.shape[0]:
        return np.ndarray((0, 0))

    stride = ndarray.strides[0]
    return np.lib.stride_tricks.as_strided(
        ndarray, shape=(ndarray.shape[0] - window_size + 1, window_size),
        strides=(stride, stride))


def iter_windows(texts, window_size, copy=False, ignore_below_size=True, include_doc_num=False):
    """Produce a generator over the given texts using a sliding window of `window_size`.

    The windows produced are views of some subsequence of a text.
    To use deep copies instead, pass `copy=True`.

    Parameters
    ----------
    texts : list of str
        List of string sentences.
    window_size : int
        Size of sliding window.
    copy : bool, optional
        Produce deep copies.
    ignore_below_size : bool, optional
        Ignore documents that are not at least `window_size` in length?
    include_doc_num : bool, optional
        Yield the text position within `texts` along with each window?

    """
    for doc_num, document in enumerate(texts):
        for window in _iter_windows(document, window_size, copy, ignore_below_size):
            if include_doc_num:
                yield (doc_num, window)
            else:
                yield window


def _iter_windows(document, window_size, copy=False, ignore_below_size=True):
    doc_windows = strided_windows(document, window_size)
    if doc_windows.shape[0] == 0:
        if not ignore_below_size:
            yield document.copy() if copy else document
    else:
        for doc_window in doc_windows:
            yield doc_window.copy() if copy else doc_window
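
# A minimal sketch of `iter_windows` over two tokenized texts (arbitrary examples);
# note the second text is shorter than the window, so it is skipped by default
# (ignore_below_size=True):
#
#     >>> texts = [['all', 'that', 'glitters', 'is', 'not', 'gold'], ['sparse']]
#     >>> [list(w) for w in iter_windows(texts, 3)]
#     [['all', 'that', 'glitters'], ['that', 'glitters', 'is'], ['glitters', 'is', 'not'], ['is', 'not', 'gold']]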


def flatten(nested_list):
    """Recursively flatten a nested sequence of elements.

    Parameters
    ----------
    nested_list : iterable
        Possibly nested sequence of elements to flatten.

    Returns
    -------
    list
        Flattened version of `nested_list` where any elements that are an iterable (`collections.Iterable`)
        have been unpacked into the top-level list, in a recursive fashion.

    """
    return list(lazy_flatten(nested_list))


def lazy_flatten(nested_list):
    """Lazy version of :func:`~gensim.utils.flatten`.

    Parameters
    ----------
    nested_list : list
        Possibly nested list.

    Yields
    ------
    object
        Element of list.

    """
    for el in nested_list:
        if isinstance(el, collections.Iterable) and not isinstance(el, string_types):
            for sub in flatten(el):
                yield sub
        else:
            yield el


def save_as_line_sentence(corpus, filename):
    """Save the corpus in LineSentence format, i.e. each sentence on a separate line,
    tokens are separated by space.

    Parameters
    ----------
    corpus : iterable of iterables of strings

    """
    with smart_open(filename, mode='wb', encoding='utf8') as fout:
        for sentence in corpus:
            line = any2unicode(' '.join(sentence) + '\n')
            fout.write(line)


def effective_n_jobs(n_jobs):
    """Determine the number of jobs that can run in parallel.

    Just like in sklearn, passing n_jobs=-1 means using all available
    CPU cores.

    Parameters
    ----------
    n_jobs : int
        Number of workers requested by caller.

    Returns
    -------
    int
        Number of effective jobs.

    """
    if n_jobs == 0:
        raise ValueError('n_jobs == 0 in Parallel has no meaning')
    elif n_jobs is None:
        return 1
    elif n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)
    return n_jobs
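
# A doctest-style sketch of the `effective_n_jobs` conventions (mirrors sklearn's;
# the concrete value for -1 depends on the machine's CPU count):
#
#     >>> effective_n_jobs(None)        # unspecified -> a single job
#     1
#     >>> effective_n_jobs(4)           # positive values pass through
#     4
#     >>> effective_n_jobs(-1) == cpu_count()   # -1 -> all cores
#     True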