- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- #
- # Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
- # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-
- """Various general utility functions."""
-
- from __future__ import with_statement
- from contextlib import contextmanager
- import collections
- import logging
- import warnings
-
- try:
- from html.entities import name2codepoint as n2cp
- except ImportError:
- from htmlentitydefs import name2codepoint as n2cp
- try:
- import cPickle as _pickle
- except ImportError:
- import pickle as _pickle
-
- import re
- import unicodedata
- import os
- import random
- import itertools
- import tempfile
- from functools import wraps
- import multiprocessing
- import shutil
- import sys
- import subprocess
- import inspect
- import heapq
-
- import numpy as np
- import numbers
- import scipy.sparse
-
- from six import iterkeys, iteritems, itervalues, u, string_types, unichr
- from six.moves import xrange
-
- from smart_open import smart_open
-
- from multiprocessing import cpu_count
-
- if sys.version_info[0] >= 3:
- unicode = str
-
- logger = logging.getLogger(__name__)
-
-
- PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)
- RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)
-
-
- def get_random_state(seed):
- """Generate :class:`numpy.random.RandomState` based on input seed.
-
- Parameters
- ----------
- seed : {None, int, array_like}
- Seed for random state.
-
- Returns
- -------
- :class:`numpy.random.RandomState`
- Random state.
-
- Raises
- ------
- ValueError
- If `seed` is not one of {None, int, array_like, :class:`numpy.random.RandomState`}.
-
- Notes
- -----
- Method originally from `maciejkula/glove-python <https://github.com/maciejkula/glove-python>`_
- and written by `@joshloyal <https://github.com/joshloyal>`_.
-
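- Examples
- --------
- For example, an integer seed yields a dedicated :class:`numpy.random.RandomState`:
-
- >>> import numpy as np
- >>> from gensim.utils import get_random_state
- >>> isinstance(get_random_state(42), np.random.RandomState)
- True
-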
- """
- if seed is None or seed is np.random:
- return np.random.mtrand._rand
- if isinstance(seed, (numbers.Integral, np.integer)):
- return np.random.RandomState(seed)
- if isinstance(seed, np.random.RandomState):
- return seed
- raise ValueError('%r cannot be used to seed a np.random.RandomState instance' % seed)
-
-
- def synchronous(tlockname):
- """A decorator to place an instance-based lock around a method.
-
- Notes
- -----
- Adapted from http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/.
-
- """
- def _synched(func):
- @wraps(func)
- def _synchronizer(self, *args, **kwargs):
- tlock = getattr(self, tlockname)
- logger.debug("acquiring lock %r for %s", tlockname, func.__name__)
-
- with tlock: # use lock as a context manager to perform safe acquire/release pairs
- logger.debug("acquired lock %r for %s", tlockname, func.__name__)
- result = func(self, *args, **kwargs)
- logger.debug("releasing lock %r for %s", tlockname, func.__name__)
- return result
- return _synchronizer
- return _synched
-
-
- def file_or_filename(input):
- """Open a filename for reading with `smart_open`, or seek to the beginning if `input` is an already open file.
-
- Parameters
- ----------
- input : str or file-like
- Filename or file-like object.
-
- Returns
- -------
- file-like object
- An open file, positioned at the beginning.
-
- """
- if isinstance(input, string_types):
- # input was a filename: open as file
- return smart_open(input)
- else:
- # input already a file-like object; just reset to the beginning
- input.seek(0)
- return input
-
-
- @contextmanager
- def open_file(input):
- """Provide "with-like" behaviour without closing the file object.
-
- Parameters
- ----------
- input : str or file-like
- Filename or file-like object.
-
- Yields
- -------
- file
- File-like object based on `input` (or `input` itself, if it is already file-like).
-
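- Examples
- --------
- A usage sketch (the path 'corpus.txt' is just an illustrative placeholder):
-
- >>> with open_file('corpus.txt') as fin:  # doctest: +SKIP
- ...     first_line = fin.readline()
-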
- """
- mgr = file_or_filename(input)
- exc = False
- try:
- yield mgr
- except Exception:
- # Handling any unhandled exceptions from the code nested in 'with' statement.
- exc = True
- if not isinstance(input, string_types) or not mgr.__exit__(*sys.exc_info()):
- raise
- # Try to introspect and silence errors.
- finally:
- if not exc and isinstance(input, string_types):
- mgr.__exit__(None, None, None)
-
-
- def deaccent(text):
- """Remove letter accents from the given string.
-
- Parameters
- ----------
- text : str
- Input string.
-
- Returns
- -------
- str
- Unicode string without accents.
-
- Examples
- --------
- >>> from gensim.utils import deaccent
- >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
- u'Sef chomutovskych komunistu dostal postou bily prasek'
-
- """
- if not isinstance(text, unicode):
- # assume utf8 for byte strings, use default (strict) error handling
- text = text.decode('utf8')
- norm = unicodedata.normalize("NFD", text)
- result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
- return unicodedata.normalize("NFC", result)
-
-
- def copytree_hardlink(source, dest):
- """Recursively copy a directory ala shutils.copytree, but hardlink files instead of copying.
-
- Parameters
- ----------
- source : str
- Path to source directory
- dest : str
- Path to destination directory
-
- Warnings
- --------
- Available on UNIX systems only.
-
- """
- copy2 = shutil.copy2
- try:
- shutil.copy2 = os.link
- shutil.copytree(source, dest)
- finally:
- shutil.copy2 = copy2
-
-
- def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict", to_lower=False, lower=False):
- """Iteratively yield tokens as unicode strings, optionally removing accent marks and lowercasing it.
-
- Parameters
- ----------
- text : str or bytes
- Input string.
- deacc : bool, optional
- Remove accentuation using :func:`~gensim.utils.deaccent`?
- encoding : str, optional
- Encoding of input string, used as parameter for :func:`~gensim.utils.to_unicode`.
- errors : str, optional
- Error handling behaviour, used as parameter for :func:`~gensim.utils.to_unicode`.
- lowercase : bool, optional
- Lowercase the input string?
- to_lower : bool, optional
- Same as `lowercase`. Convenience alias.
- lower : bool, optional
- Same as `lowercase`. Convenience alias.
-
- Yields
- ------
- str
- Contiguous sequences of alphabetic characters (no digits!), using :func:`~gensim.utils.simple_tokenize`
-
- Examples
- --------
- >>> from gensim.utils import tokenize
- >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc=True))
- [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']
-
- """
- lowercase = lowercase or to_lower or lower
- text = to_unicode(text, encoding, errors=errors)
- if lowercase:
- text = text.lower()
- if deacc:
- text = deaccent(text)
- return simple_tokenize(text)
-
-
- def simple_tokenize(text):
- """Tokenize input test using :const:`gensim.utils.PAT_ALPHABETIC`.
-
- Parameters
- ----------
- text : str
- Input text.
-
- Yields
- ------
- str
- Tokens from `text`.
-
- """
- for match in PAT_ALPHABETIC.finditer(text):
- yield match.group()
-
-
- def simple_preprocess(doc, deacc=False, min_len=2, max_len=15):
- """Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.
-
- Uses :func:`~gensim.utils.tokenize` internally.
-
- Parameters
- ----------
- doc : str
- Input document.
- deacc : bool, optional
- Remove accent marks from tokens using :func:`~gensim.utils.deaccent`?
- min_len : int, optional
- Minimum length of token (inclusive). Shorter tokens are discarded.
- max_len : int, optional
- Maximum length of token in result (inclusive). Longer tokens are discarded.
-
- Returns
- -------
- list of str
- Tokens extracted from `doc`.
-
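- Examples
- --------
- Lowercase, tokenize and filter by length in one call (single-character tokens such as "a"
- are dropped by the default `min_len=2`):
-
- >>> from gensim.utils import simple_preprocess
- >>> simple_preprocess("Hello, World! This is a TEST sentence.")
- ['hello', 'world', 'this', 'is', 'test', 'sentence']
-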
- """
- tokens = [
- token for token in tokenize(doc, lower=True, deacc=deacc, errors='ignore')
- if min_len <= len(token) <= max_len and not token.startswith('_')
- ]
- return tokens
-
-
- def any2utf8(text, errors='strict', encoding='utf8'):
- """Convert a unicode or bytes string in the given encoding into a utf8 bytestring.
-
- Parameters
- ----------
- text : str
- Input text.
- errors : str, optional
- Error handling behaviour if `text` is a bytestring.
- encoding : str, optional
- Encoding of `text` if it is a bytestring.
-
- Returns
- -------
- str
- Bytestring in utf8.
-
- """
-
- if isinstance(text, unicode):
- return text.encode('utf8')
- # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8
- return unicode(text, encoding, errors=errors).encode('utf8')
-
-
- to_utf8 = any2utf8
-
-
- def any2unicode(text, encoding='utf8', errors='strict'):
- """Convert `text` (bytestring in given encoding or unicode) to unicode.
-
- Parameters
- ----------
- text : str
- Input text.
- errors : str, optional
- Error handling behaviour if `text` is a bytestring.
- encoding : str, optional
- Encoding of `text` if it is a bytestring.
-
- Returns
- -------
- str
- Unicode version of `text`.
-
- """
- if isinstance(text, unicode):
- return text
- return unicode(text, encoding, errors=errors)
-
-
- to_unicode = any2unicode
-
-
- def call_on_class_only(*args, **kwargs):
- """Helper to raise `AttributeError` if a class method is called on an instance. Used internally.
-
- Parameters
- ----------
- *args
- Variable length argument list.
- **kwargs
- Arbitrary keyword arguments.
-
- Raises
- ------
- AttributeError
- If a class method is called on an instance.
-
- """
- raise AttributeError('This method should be called on a class object.')
-
-
- class SaveLoad(object):
- """Serialize/deserialize object from disk, by equipping objects with the save()/load() methods.
-
- Warnings
- --------
- This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes
- such as lambda functions etc.
-
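- Examples
- --------
- A usage sketch: any object inheriting from `SaveLoad` (e.g. a trained gensim model) can be
- persisted and restored like this (the path below is only an illustration):
-
- >>> model.save('/tmp/my_model.gensim')  # doctest: +SKIP
- >>> loaded_model = model.__class__.load('/tmp/my_model.gensim')  # doctest: +SKIP
-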
- """
- @classmethod
- def load(cls, fname, mmap=None):
- """Load an object previously saved using :meth:`~gensim.utils.SaveLoad.save` from a file.
-
- Parameters
- ----------
- fname : str
- Path to file that contains needed object.
- mmap : str, optional
- Memory-map option. If the object was saved with large arrays stored separately, you can load these arrays
- via mmap (shared memory) using `mmap='r'`.
- If the file being loaded is compressed (either '.gz' or '.bz2'), then `mmap=None` **must be** set.
-
- See Also
- --------
- :meth:`~gensim.utils.SaveLoad.save`
- Save object to file.
-
- Returns
- -------
- object
- Object loaded from `fname`.
-
- Raises
- ------
- AttributeError
- When called on an object instance instead of class (this is a class method).
-
- """
- logger.info("loading %s object from %s", cls.__name__, fname)
-
- compress, subname = SaveLoad._adapt_by_suffix(fname)
-
- obj = unpickle(fname)
- obj._load_specials(fname, mmap, compress, subname)
- logger.info("loaded %s", fname)
- return obj
-
- def _load_specials(self, fname, mmap, compress, subname):
- """Load attributes that were stored separately, and give them the same opportunity
- to recursively load using the :class:`~gensim.utils.SaveLoad` interface.
-
- Parameters
- ----------
- fname : str
- Input file path.
- mmap : {None, 'r+', 'r', 'w+', 'c'}
- Memory-map options. See `numpy.load(mmap_mode)
- <https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.load.html>`_.
- compress : bool
- Is the input file compressed?
- subname : str
- Attribute name. Set automatically during recursive processing.
-
- """
- def mmap_error(obj, filename):
- return IOError(
- 'Cannot mmap compressed object %s in file %s. ' % (obj, filename) +
- 'Use `load(fname, mmap=None)` or uncompress files manually.'
- )
-
- for attrib in getattr(self, '__recursive_saveloads', []):
- cfname = '.'.join((fname, attrib))
- logger.info("loading %s recursively from %s.* with mmap=%s", attrib, cfname, mmap)
- getattr(self, attrib)._load_specials(cfname, mmap, compress, subname)
-
- for attrib in getattr(self, '__numpys', []):
- logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)
-
- if compress:
- if mmap:
- raise mmap_error(attrib, subname(fname, attrib))
-
- val = np.load(subname(fname, attrib))['val']
- else:
- val = np.load(subname(fname, attrib), mmap_mode=mmap)
-
- setattr(self, attrib, val)
-
- for attrib in getattr(self, '__scipys', []):
- logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)
- sparse = unpickle(subname(fname, attrib))
- if compress:
- if mmap:
- raise mmap_error(attrib, subname(fname, attrib))
-
- with np.load(subname(fname, attrib, 'sparse')) as f:
- sparse.data = f['data']
- sparse.indptr = f['indptr']
- sparse.indices = f['indices']
- else:
- sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap)
- sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap)
- sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap)
-
- setattr(self, attrib, sparse)
-
- for attrib in getattr(self, '__ignoreds', []):
- logger.info("setting ignored attribute %s to None", attrib)
- setattr(self, attrib, None)
-
- @staticmethod
- def _adapt_by_suffix(fname):
- """Get compress setting and filename for numpy file compression.
-
- Parameters
- ----------
- fname : str
- Input filename.
-
- Returns
- -------
- (bool, function)
- First element is True if `fname` is compressed, second is a function that builds
- the filenames of the separately stored arrays.
-
- """
- compress, suffix = (True, 'npz') if fname.endswith('.gz') or fname.endswith('.bz2') else (False, 'npy')
- return compress, lambda *args: '.'.join(args + (suffix,))
-
- def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
- """Save the object to a file. Used internally by :meth:`gensim.utils.SaveLoad.save()`.
-
- Parameters
- ----------
- fname : str
- Path to file.
- separately : list, optional
- Iterable of attributes that need to be stored separately.
- sep_limit : int, optional
- Don't store arrays smaller than this separately, in bytes.
- ignore : frozenset, optional
- Attributes that shouldn't be stored at all.
- pickle_protocol : int, optional
- Protocol number for pickle.
-
- Notes
- -----
- If `separately` is None, automatically detect large numpy/scipy.sparse arrays in the object being stored,
- and store them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays back
- on load efficiently.
-
- You can also set `separately` manually, in which case it must be a list of attribute names to be stored
- in separate files. The automatic check is not performed in this case.
-
- """
- logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately)
-
- compress, subname = SaveLoad._adapt_by_suffix(fname)
-
- restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol,
- compress, subname)
- try:
- pickle(self, fname, protocol=pickle_protocol)
- finally:
- # restore attribs handled specially
- for obj, asides in restores:
- for attrib, val in iteritems(asides):
- setattr(obj, attrib, val)
- logger.info("saved %s", fname)
-
- def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
- """Save aside any attributes that need to be handled separately, including
- by recursion any attributes that are themselves :class:`~gensim.utils.SaveLoad` instances.
-
- Parameters
- ----------
- fname : str
- Output filename.
- separately : list or None
- List of attributes to store separately.
- sep_limit : int
- Don't store arrays smaller than this separately. In bytes.
- ignore : iterable of str
- Attributes that shouldn't be stored at all.
- pickle_protocol : int
- Protocol number for pickle.
- compress : bool
- If True - compress output with :func:`numpy.savez_compressed`.
- subname : function
- Produced by :meth:`~gensim.utils.SaveLoad._adapt_by_suffix`
-
- Returns
- -------
- list of (obj, {attrib: value, ...})
- Settings that the caller should use to restore each object's attributes that were set aside
- during the default :func:`~gensim.utils.pickle`.
-
- """
- asides = {}
- sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)
- if separately is None:
- separately = []
- for attrib, val in iteritems(self.__dict__):
- if isinstance(val, np.ndarray) and val.size >= sep_limit:
- separately.append(attrib)
- elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit:
- separately.append(attrib)
-
- # whatever's in `separately` or `ignore` at this point won't get pickled
- for attrib in separately + list(ignore):
- if hasattr(self, attrib):
- asides[attrib] = getattr(self, attrib)
- delattr(self, attrib)
-
- recursive_saveloads = []
- restores = []
- for attrib, val in iteritems(self.__dict__):
- if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading
- recursive_saveloads.append(attrib)
- cfname = '.'.join((fname, attrib))
- restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname))
-
- try:
- numpys, scipys, ignoreds = [], [], []
- for attrib, val in iteritems(asides):
- if isinstance(val, np.ndarray) and attrib not in ignore:
- numpys.append(attrib)
- logger.info("storing np array '%s' to %s", attrib, subname(fname, attrib))
-
- if compress:
- np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val))
- else:
- np.save(subname(fname, attrib), np.ascontiguousarray(val))
-
- elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore:
- scipys.append(attrib)
- logger.info("storing scipy.sparse array '%s' under %s", attrib, subname(fname, attrib))
-
- if compress:
- np.savez_compressed(
- subname(fname, attrib, 'sparse'),
- data=val.data,
- indptr=val.indptr,
- indices=val.indices
- )
- else:
- np.save(subname(fname, attrib, 'data'), val.data)
- np.save(subname(fname, attrib, 'indptr'), val.indptr)
- np.save(subname(fname, attrib, 'indices'), val.indices)
-
- data, indptr, indices = val.data, val.indptr, val.indices
- val.data, val.indptr, val.indices = None, None, None
-
- try:
- # store array-less object
- pickle(val, subname(fname, attrib), protocol=pickle_protocol)
- finally:
- val.data, val.indptr, val.indices = data, indptr, indices
- else:
- logger.info("not storing attribute %s", attrib)
- ignoreds.append(attrib)
-
- self.__dict__['__numpys'] = numpys
- self.__dict__['__scipys'] = scipys
- self.__dict__['__ignoreds'] = ignoreds
- self.__dict__['__recursive_saveloads'] = recursive_saveloads
- except Exception:
- # restore the attributes if exception-interrupted
- for attrib, val in iteritems(asides):
- setattr(self, attrib, val)
- raise
- return restores + [(self, asides)]
-
- def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
- """Save the object to a file.
-
- Parameters
- ----------
- fname_or_handle : str or file-like
- Path to output file or already opened file-like object. If the object is a file handle,
- no special array handling will be performed, all attributes will be saved to the same file.
- separately : list of str or None, optional
- If None, automatically detect large numpy/scipy.sparse arrays in the object being stored, and store
- them into separate files. This prevents memory errors for large objects, and also allows
- `memory-mapping <https://en.wikipedia.org/wiki/Mmap>`_ the large arrays for efficient
- loading and sharing the large arrays in RAM between multiple processes.
-
- If list of str: store these attributes into separate files. The automated size check
- is not performed in this case.
- sep_limit : int, optional
- Don't store arrays smaller than this separately. In bytes.
- ignore : frozenset of str, optional
- Attributes that shouldn't be stored at all.
- pickle_protocol : int, optional
- Protocol number for pickle.
-
- See Also
- --------
- :meth:`~gensim.utils.SaveLoad.load`
- Load object from file.
-
- """
- try:
- _pickle.dump(self, fname_or_handle, protocol=pickle_protocol)
- logger.info("saved %s object", self.__class__.__name__)
- except TypeError: # `fname_or_handle` does not have write attribute
- self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol)
-
-
- def identity(p):
- """Identity fnc, for flows that don't accept lambda (pickling etc).
-
- Parameters
- ----------
- p : object
- Input parameter.
-
- Returns
- -------
- object
- Same as `p`.
-
- """
- return p
-
-
- def get_max_id(corpus):
- """Get the highest feature id that appears in the corpus.
-
- Parameters
- ----------
- corpus : iterable of iterable of (int, numeric)
- Collection of texts in BoW format.
-
- Returns
- ------
- int
- Highest feature id.
-
- Notes
- -----
- For empty `corpus` return -1.
-
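- Examples
- --------
- For example, with a two-document BoW corpus:
-
- >>> from gensim.utils import get_max_id
- >>> get_max_id([[(1, 2), (5, 1)], [(2, 1)]])
- 5
-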
- """
- maxid = -1
- for document in corpus:
- maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document])) # [-1] to avoid exceptions from max(empty)
- return maxid
-
-
- class FakeDict(object):
- """Objects of this class act as dictionaries that map integer->str(integer), for a specified
- range of integers [0, num_terms).
-
- This is meant to avoid allocating real dictionaries when `num_terms` is huge, which is a waste of memory.
-
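- Examples
- --------
- For example, a "dictionary" over the ids 0..4:
-
- >>> from gensim.utils import FakeDict
- >>> d = FakeDict(5)
- >>> d[3]
- '3'
- >>> len(d)
- 5
-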
- """
- def __init__(self, num_terms):
- """
-
- Parameters
- ----------
- num_terms : int
- Number of terms.
-
- """
- self.num_terms = num_terms
-
- def __str__(self):
- return "FakeDict(num_terms=%s)" % self.num_terms
-
- def __getitem__(self, val):
- if 0 <= val < self.num_terms:
- return str(val)
- raise ValueError("internal id out of bounds (%s, expected <0..%s))" % (val, self.num_terms))
-
- def iteritems(self):
- """Iterate over all keys and values.
-
-
- Yields
- ------
- (int, str)
- Pair of (id, token).
-
- """
- for i in xrange(self.num_terms):
- yield i, str(i)
-
- def keys(self):
- """Override the `dict.keys()`, which is used to determine the maximum internal id of a corpus,
- i.e. the vocabulary dimensionality.
-
- Returns
- -------
- list of int
- Highest id, packed in list.
-
- Notes
- -----
- To avoid materializing the whole `range(0, self.num_terms)`,
- this returns the highest id = `[self.num_terms - 1]` only.
-
- """
- return [self.num_terms - 1]
-
- def __len__(self):
- return self.num_terms
-
- def get(self, val, default=None):
- if 0 <= val < self.num_terms:
- return str(val)
- return default
-
-
- def dict_from_corpus(corpus):
- """Scan corpus for all word ids that appear in it, then construct a mapping
- which maps each `word_id` -> `str(word_id)`.
-
- Parameters
- ----------
- corpus : iterable of iterable of (int, numeric)
- Collection of texts in BoW format.
-
- Returns
- ------
- id2word : :class:`~gensim.utils.FakeDict`
- "Fake" mapping which maps each `word_id` -> `str(word_id)`.
-
- Warnings
- --------
- This function is used whenever *words* need to be displayed (as opposed to just their ids)
- but no `word_id` -> `word` mapping was provided. The resulting mapping only covers words actually
- used in the corpus, up to the highest `word_id` found.
-
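- Examples
- --------
- For example, the highest id in the corpus below is 3, so the mapping covers ids 0..3:
-
- >>> from gensim.utils import dict_from_corpus
- >>> id2word = dict_from_corpus([[(1, 1.0)], [(3, 2.0)]])
- >>> len(id2word), id2word[3]
- (4, '3')
-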
- """
- num_terms = 1 + get_max_id(corpus)
- id2word = FakeDict(num_terms)
- return id2word
-
-
- def is_corpus(obj):
- """Check whether `obj` is a corpus, by peeking at its first element. Works even on streamed generators.
- The peeked element is put back into the object returned by this function, so always use
- that returned object instead of the original `obj`.
-
- Parameters
- ----------
- obj : object
- An `iterable of iterable` that contains (int, numeric).
-
- Returns
- -------
- (bool, object)
- Pair of (is `obj` a corpus, `obj` with peeked element restored)
-
- Examples
- --------
- >>> from gensim.utils import is_corpus
- >>> corpus = [[(1, 1.0)], [(2, -0.3), (3, 0.12)]]
- >>> corpus_or_not, corpus = is_corpus(corpus)
-
- Warnings
- --------
- An "empty" corpus (empty input sequence) is ambiguous, so in this case
- the result is forcefully defined as (False, `obj`).
-
- """
- try:
- if 'Corpus' in obj.__class__.__name__: # the most common case, quick hack
- return True, obj
- except Exception:
- pass
- try:
- if hasattr(obj, 'next') or hasattr(obj, '__next__'):
- # the input is an iterator object, meaning once we call next()
- # that element could be gone forever. we must be careful to put
- # whatever we retrieve back again
- doc1 = next(obj)
- obj = itertools.chain([doc1], obj)
- else:
- doc1 = next(iter(obj)) # empty corpus is resolved to False here
- if len(doc1) == 0: # sparse documents must have a __len__ function (list, tuple...)
- return True, obj # the first document is empty=>assume this is a corpus
-
- # if obj is a 1D numpy array(scalars) instead of 2-tuples, it resolves to False here
- id1, val1 = next(iter(doc1))
- id1, val1 = int(id1), float(val1) # must be a 2-tuple (integer, float)
- except Exception:
- return False, obj
- return True, obj
-
-
- def get_my_ip():
- """Try to obtain our external ip (from the Pyro4 nameserver's point of view)
-
- Returns
- -------
- str
- IP address.
-
- Warnings
- --------
- This tries to sidestep the issue of bogus `/etc/hosts` entries and other local misconfiguration,
- which often mess up hostname resolution.
- If all else fails, fall back to simple `socket.gethostbyname()` lookup.
-
- """
- import socket
- try:
- from Pyro4.naming import locateNS
- # we know the nameserver must exist, so use it as our anchor point
- ns = locateNS()
- s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
- s.connect((ns._pyroUri.host, ns._pyroUri.port))
- result, port = s.getsockname()
- except Exception:
- try:
- # see what ifconfig says about our default interface
- import commands
- result = commands.getoutput("ifconfig").split("\n")[1].split()[1][5:]
- if len(result.split('.')) != 4:
- raise Exception()
- except Exception:
- # give up, leave the resolution to gethostbyname
- result = socket.gethostbyname(socket.gethostname())
- return result
-
-
- class RepeatCorpus(SaveLoad):
- """Wrap a `corpus` as another corpus of length `reps`. This is achieved by repeating documents from `corpus`
- over and over again, until the requested length `len(result) == reps` is reached.
- Repetition is done on the fly (efficiently), via `itertools`.
-
- Examples
- --------
- >>> from gensim.utils import RepeatCorpus
- >>>
- >>> corpus = [[(1, 2)], []] # 2 documents
- >>> list(RepeatCorpus(corpus, 5)) # repeat 2.5 times to get 5 documents
- [[(1, 2)], [], [(1, 2)], [], [(1, 2)]]
-
- """
- def __init__(self, corpus, reps):
- """
-
- Parameters
- ----------
- corpus : iterable of iterable of (int, numeric)
- Input corpus.
- reps : int
- Number of repeats for documents from corpus.
-
- """
- self.corpus = corpus
- self.reps = reps
-
- def __iter__(self):
- return itertools.islice(itertools.cycle(self.corpus), self.reps)
-
-
- class RepeatCorpusNTimes(SaveLoad):
- """Wrap a `corpus` and repeat it `n` times.
-
- Examples
- --------
- >>> from gensim.utils import RepeatCorpusNTimes
- >>>
- >>> corpus = [[(1, 0.5)], []]
- >>> list(RepeatCorpusNTimes(corpus, 3)) # repeat 3 times
- [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []]
-
- """
- def __init__(self, corpus, n):
- """
-
- Parameters
- ----------
- corpus : iterable of iterable of (int, numeric)
- Input corpus.
- n : int
- Number of repeats for corpus.
-
- """
- self.corpus = corpus
- self.n = n
-
- def __iter__(self):
- for _ in xrange(self.n):
- for document in self.corpus:
- yield document
-
-
- class ClippedCorpus(SaveLoad):
- """Wrap a `corpus` and return `max_doc` element from it."""
- def __init__(self, corpus, max_docs=None):
- """
-
- Parameters
- ----------
- corpus : iterable of iterable of (int, numeric)
- Input corpus.
- max_docs : int
- Maximum number of documents in the wrapped corpus.
-
- Warnings
- --------
- Any documents after `max_docs` are ignored. This effectively limits the length of the returned corpus
- to <= `max_docs`. Set `max_docs=None` for "no limit", effectively wrapping the entire input corpus.
-
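- Examples
- --------
- For example, keep only the first two documents of a three-document corpus:
-
- >>> from gensim.utils import ClippedCorpus
- >>> corpus = [[(1, 1.0)], [(2, 0.5)], [(3, 0.5)]]
- >>> list(ClippedCorpus(corpus, max_docs=2))
- [[(1, 1.0)], [(2, 0.5)]]
-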
- """
- self.corpus = corpus
- self.max_docs = max_docs
-
- def __iter__(self):
- return itertools.islice(self.corpus, self.max_docs)
-
- def __len__(self):
- return min(self.max_docs, len(self.corpus))
-
-
- class SlicedCorpus(SaveLoad):
- """Wrap `corpus` and return a slice of it."""
- def __init__(self, corpus, slice_):
- """
-
- Parameters
- ----------
- corpus : iterable of iterable of (int, numeric)
- Input corpus.
- slice_ : slice or iterable
- Slice for `corpus`.
-
- Notes
- -----
- Negative slicing can only be used if the corpus is indexable; otherwise, the corpus will be iterated over.
- Slice can also be a np.ndarray to support fancy indexing.
-
- Calculating the size of a SlicedCorpus is expensive when using a slice as the corpus has
- to be iterated over once. Using a list or np.ndarray does not have this drawback, but consumes more memory.
-
- """
- self.corpus = corpus
- self.slice_ = slice_
- self.length = None
-
- def __iter__(self):
- if hasattr(self.corpus, 'index') and len(self.corpus.index) > 0:
- return (self.corpus.docbyoffset(i) for i in self.corpus.index[self.slice_])
- return itertools.islice(self.corpus, self.slice_.start, self.slice_.stop, self.slice_.step)
-
- def __len__(self):
- # check cached length, calculate if needed
- if self.length is None:
- if isinstance(self.slice_, (list, np.ndarray)):
- self.length = len(self.slice_)
- elif isinstance(self.slice_, slice):
- (start, end, step) = self.slice_.indices(len(self.corpus.index))
- diff = end - start
- self.length = diff // step + (diff % step > 0)
- else:
- self.length = sum(1 for x in self)
-
- return self.length
-
-
- def safe_unichr(intval):
- """Create a unicode character from its integer value. In case `unichr` fails, render the character
- as an escaped `\\U<8-byte hex value of intval>` string.
-
- Parameters
- ----------
- intval : int
- Integer code of character
-
- Returns
- -------
- string
- Unicode string of character
-
- """
- try:
- return unichr(intval)
- except ValueError:
- # ValueError: unichr() arg not in range(0x10000) (narrow Python build)
- s = "\\U%08x" % intval
- # return UTF16 surrogate pair
- return s.decode('unicode-escape')
-
-
- def decode_htmlentities(text):
- """Decode all HTML entities in text that are encoded as hex, decimal or named entities.
- Adapted from `python-twitter-ircbot/html_decode.py
- <http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py>`_.
-
- Parameters
- ----------
- text : str
- Input HTML.
-
- Examples
- --------
- >>> from gensim.utils import decode_htmlentities
- >>>
- >>> u = u'E tu vivrai nel terrore - L&#39;aldil&agrave; (1981)'
- >>> print(decode_htmlentities(u).encode('UTF-8'))
- E tu vivrai nel terrore - L'aldilà (1981)
- >>> print(decode_htmlentities("l&#39;eau"))
- l'eau
- >>> print(decode_htmlentities("foo &lt; bar"))
- foo < bar
-
- """
- def substitute_entity(match):
- try:
- ent = match.group(3)
- if match.group(1) == "#":
- # decoding by number
- if match.group(2) == '':
- # number is in decimal
- return safe_unichr(int(ent))
- elif match.group(2) in ['x', 'X']:
- # number is in hex
- return safe_unichr(int(ent, 16))
- else:
- # they were using a name
- cp = n2cp.get(ent)
- if cp:
- return safe_unichr(cp)
- else:
- return match.group()
- except Exception:
- # in case of errors, return original input
- return match.group()
-
- return RE_HTML_ENTITY.sub(substitute_entity, text)
-
-
- def chunkize_serial(iterable, chunksize, as_numpy=False, dtype=np.float32):
- """Yield elements from `iterable` in "chunksize"-ed groups.
-
- The last returned element may be smaller if the length of collection is not divisible by `chunksize`.
-
- Parameters
- ----------
- iterable : iterable of object
- An iterable.
- chunksize : int
- Split iterable into chunks of this size.
- as_numpy : bool, optional
- Yield chunks as `np.ndarray` instead of lists.
- dtype : numpy dtype, optional
- NumPy dtype of the per-document arrays, used only when `as_numpy=True`.
-
- Yields
- ------
- list OR np.ndarray
- "chunksize"-ed chunks of elements from `iterable`.
-
- Examples
- --------
- >>> print(list(grouper(range(10), 3)))
- [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
-
- """
- it = iter(iterable)
- while True:
- if as_numpy:
- # convert each document to a 2d numpy array (~6x faster when transmitting
- # chunk data over the wire, in Pyro)
- wrapped_chunk = [[np.array(doc, dtype=dtype) for doc in itertools.islice(it, int(chunksize))]]
- else:
- wrapped_chunk = [list(itertools.islice(it, int(chunksize)))]
- if not wrapped_chunk[0]:
- break
- # memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference
- yield wrapped_chunk.pop()
-
-
- grouper = chunkize_serial
-
-
- class InputQueue(multiprocessing.Process):
- """Populate a queue of input chunks from a streamed corpus.
-
- Useful for reading and chunking corpora in the background, in a separate process,
- so that workers that use the queue are not starved for input chunks.
-
- """
- def __init__(self, q, corpus, chunksize, maxsize, as_numpy):
- """
- Parameters
- ----------
- q : multiprocessing.Queue
- Enqueue chunks into this queue.
- corpus : iterable of iterable of (int, numeric)
- Corpus to read and split into "chunksize"-ed groups.
- chunksize : int
- Split `corpus` into chunks of this size.
- maxsize : int
- Maximum number of prepared chunks to keep in the queue.
- as_numpy : bool, optional
- Enqueue chunks as `numpy.ndarray` instead of lists.
-
- """
- super(InputQueue, self).__init__()
- self.q = q
- self.maxsize = maxsize
- self.corpus = corpus
- self.chunksize = chunksize
- self.as_numpy = as_numpy
-
- def run(self):
- it = iter(self.corpus)
- while True:
- chunk = itertools.islice(it, self.chunksize)
- if self.as_numpy:
- # HACK XXX convert documents to numpy arrays, to save memory.
- # This also gives a scipy warning at runtime:
- # "UserWarning: indices array has non-integer dtype (float64)"
- wrapped_chunk = [[np.asarray(doc) for doc in chunk]]
- else:
- wrapped_chunk = [list(chunk)]
-
- if not wrapped_chunk[0]:
- self.q.put(None, block=True)
- break
-
- try:
- qsize = self.q.qsize()
- except NotImplementedError:
- qsize = '?'
- logger.debug("prepared another chunk of %i documents (qsize=%s)", len(wrapped_chunk[0]), qsize)
- self.q.put(wrapped_chunk.pop(), block=True)
-
-
- if os.name == 'nt':
- warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
-
- def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
- """Split `corpus` into fixed-sized chunks, using :func:`~gensim.utils.chunkize_serial`.
-
- Parameters
- ----------
- corpus : iterable of object
- An iterable.
- chunksize : int
- Split `corpus` into chunks of this size.
- maxsize : int, optional
- Ignored. For interface compatibility only.
- as_numpy : bool, optional
- Yield chunks as `np.ndarray`s instead of lists?
-
- Yields
- ------
- list OR np.ndarray
- "chunksize"-ed chunks of elements from `corpus`.
-
- """
- for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):
- yield chunk
- else:
- def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
- """Split `corpus` into fixed-sized chunks, using :func:`~gensim.utils.chunkize_serial`.
-
- Parameters
- ----------
- corpus : iterable of object
- An iterable.
- chunksize : int
- Split `corpus` into chunks of this size.
- maxsize : int, optional
- If > 0, prepare chunks in a background process, filling a chunk queue of size at most `maxsize`.
- as_numpy : bool, optional
- Yield chunks as `np.ndarray` instead of lists?
-
- Yields
- ------
- list OR np.ndarray
- "chunksize"-ed chunks of elements from `corpus`.
-
- Notes
- -----
- Each chunk is of length `chunksize`, except the last one which may be smaller.
- A once-only input stream (`corpus` from a generator) is ok, chunking is done efficiently via itertools.
-
- If `maxsize > 0`, don't wait idly in between successive chunk `yields`, but rather keep filling a short queue
- (of size at most `maxsize`) with forthcoming chunks in advance. This is realized by starting a separate process,
- and is meant to reduce I/O delays, which can be significant when `corpus` comes from a slow medium
- like HDD, database or network.
-
- If `maxsize == 0`, don't fool around with parallelism and simply yield chunks serially
- via :func:`~gensim.utils.chunkize_serial` (no I/O optimizations).
-
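- Examples
- --------
- A sketch of the background-queue mode (`my_corpus` and `process` are placeholders for your
- own stream and processing function):
-
- >>> for chunk in chunkize(my_corpus, chunksize=256, maxsize=2):  # doctest: +SKIP
- ...     process(chunk)
-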
- """
- assert chunksize > 0
-
- if maxsize > 0:
- q = multiprocessing.Queue(maxsize=maxsize)
- worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy)
- worker.daemon = True
- worker.start()
- while True:
- chunk = [q.get(block=True)]
- if chunk[0] is None:
- break
- yield chunk.pop()
- else:
- for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):
- yield chunk
-
-
- def smart_extension(fname, ext):
- """Append a file extension `ext` to `fname`, while keeping compressed extensions like `.bz2` or
- `.gz` (if any) at the end.
-
- Parameters
- ----------
- fname : str
- Filename or full path.
- ext : str
- Extension to append before any compression extensions.
-
- Returns
- -------
- str
- New path to file with `ext` appended.
-
- Examples
- --------
-
- >>> from gensim.utils import smart_extension
- >>> smart_extension("my_file.pkl.gz", ".vectors")
- 'my_file.pkl.vectors.gz'
-
- """
- fname, oext = os.path.splitext(fname)
- if oext.endswith('.bz2'):
- fname = fname + oext[:-4] + ext + '.bz2'
- elif oext.endswith('.gz'):
- fname = fname + oext[:-3] + ext + '.gz'
- else:
- fname = fname + oext + ext
-
- return fname
-
-
- def pickle(obj, fname, protocol=2):
- """Pickle object `obj` to file `fname`, using smart_open so that `fname` can be on S3, HDFS, compressed etc.
-
- Parameters
- ----------
- obj : object
- Any python object.
- fname : str
- Path to pickle file.
- protocol : int, optional
- Pickle protocol number. Default is 2 in order to support compatibility across python 2.x and 3.x.
-
- """
- with smart_open(fname, 'wb') as fout: # 'b' for binary, needed on Windows
- _pickle.dump(obj, fout, protocol=protocol)
-
-
- def unpickle(fname):
- """Load object from `fname`, using smart_open so that `fname` can be on S3, HDFS, compressed etc.
-
- Parameters
- ----------
- fname : str
- Path to pickle file.
-
- Returns
- -------
- object
- Python object loaded from `fname`.
-
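- Examples
- --------
- A round-trip sketch together with :func:`~gensim.utils.pickle` (the path is illustrative):
-
- >>> from gensim.utils import pickle, unpickle
- >>> pickle({'answer': 42}, '/tmp/example.pkl')  # doctest: +SKIP
- >>> unpickle('/tmp/example.pkl')  # doctest: +SKIP
- {'answer': 42}
-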
- """
- with smart_open(fname, 'rb') as f:
- # Because of loading from S3 load can't be used (missing readline in smart_open)
- if sys.version_info > (3, 0):
- return _pickle.load(f, encoding='latin1')
- else:
- return _pickle.loads(f.read())
-
-
- def revdict(d):
- """Reverse a dictionary mapping, i.e. `{1: 2, 3: 4}` -> `{2: 1, 4: 3}`.
-
- Parameters
- ----------
- d : dict
- Input dictionary.
-
- Returns
- -------
- dict
- Reversed dictionary mapping.
-
- Notes
- -----
- When two keys map to the same value, only one of them will be kept in the result (which one is kept is arbitrary).
-
- Examples
- --------
- >>> from gensim.utils import revdict
- >>> d = {1: 2, 3: 4}
- >>> revdict(d)
- {2: 1, 4: 3}
-
- """
- return {v: k for (k, v) in iteritems(dict(d))}
-
-
- def deprecated(reason):
- """Decorator to mark functions as deprecated.
-
- Calling a decorated function will result in a warning being emitted, using warnings.warn.
- Adapted from https://stackoverflow.com/a/40301488/8001386.
-
- Parameters
- ----------
- reason : str
- Reason of deprecation.
-
- Returns
- -------
- function
- Decorated function
-
- """
- if isinstance(reason, string_types):
- def decorator(func):
- fmt = "Call to deprecated `{name}` ({reason})."
-
- @wraps(func)
- def new_func1(*args, **kwargs):
- warnings.warn(
- fmt.format(name=func.__name__, reason=reason),
- category=DeprecationWarning,
- stacklevel=2
- )
- return func(*args, **kwargs)
-
- return new_func1
- return decorator
-
- elif inspect.isclass(reason) or inspect.isfunction(reason):
- func = reason
- fmt = "Call to deprecated `{name}`."
-
- @wraps(func)
- def new_func2(*args, **kwargs):
- warnings.warn(
- fmt.format(name=func.__name__),
- category=DeprecationWarning,
- stacklevel=2
- )
- return func(*args, **kwargs)
- return new_func2
-
- else:
- raise TypeError(repr(type(reason)))
-
-
- @deprecated("Function will be removed in 4.0.0")
- def toptexts(query, texts, index, n=10):
- """Debug fnc to help inspect the top `n` most similar documents (according to a similarity index `index`),
- to see if they are actually related to the query.
-
- Parameters
- ----------
- query : {list of (int, number), numpy.ndarray}
- vector OR BoW (list of tuples)
- texts : list of str
- Object that can return something insightful for each document via `texts[docid]`,
- such as its fulltext or snippet.
- index : any
- An instance from :mod:`gensim.similarities.docsim`.
-
- Returns
- -------
- list
- A list of 3-tuples (docid, doc's similarity to the query, texts[docid]).
-
- """
- sims = index[query] # perform a similarity query against the corpus
- sims = sorted(enumerate(sims), key=lambda item: -item[1])
-
- return [(topid, topcosine, texts[topid]) for topid, topcosine in sims[:n]] # only consider top-n most similar docs
-
-
- def randfname(prefix='gensim'):
- """Generate a random filename in temp.
-
- Parameters
- ----------
- prefix : str
- Prefix of filename.
-
- Returns
- -------
- str
- Full path in the system's temporary folder, ending in a random filename.
-
- """
- randpart = hex(random.randint(0, 0xffffff))[2:]
- return os.path.join(tempfile.gettempdir(), prefix + randpart)
-
-
- @deprecated("Function will be removed in 4.0.0")
- def upload_chunked(server, docs, chunksize=1000, preprocess=None):
- """Memory-friendly upload of documents to a SimServer (or Pyro SimServer proxy).
-
- Notes
- -----
- Use this function to train or index large collections -- avoid sending the
- entire corpus over the wire as a single Pyro in-memory object. The documents
- will be sent in smaller chunks, of `chunksize` documents each.
-
- """
- start = 0
- for chunk in grouper(docs, chunksize):
- end = start + len(chunk)
- logger.info("uploading documents %i-%i", start, end - 1)
- if preprocess is not None:
- pchunk = []
- for doc in chunk:
- doc['tokens'] = preprocess(doc['text'])
- del doc['text']
- pchunk.append(doc)
- chunk = pchunk
- server.buffer(chunk)
- start = end
-
-
- def getNS(host=None, port=None, broadcast=True, hmac_key=None):
- """Get a Pyro4 name server proxy.
-
- Parameters
- ----------
- host : str, optional
- Name server hostname.
- port : int, optional
- Name server port.
- broadcast : bool, optional
- Use broadcast mechanism? (i.e. reach out to all Pyro nodes in the network)
- hmac_key : str, optional
- Private key.
-
- Raises
- ------
- RuntimeError
- When Pyro name server is not found.
-
- Returns
- -------
- :class:`Pyro4.core.Proxy`
- Proxy from Pyro4.
-
- """
- import Pyro4
- try:
- return Pyro4.locateNS(host, port, broadcast, hmac_key)
- except Pyro4.errors.NamingError:
- raise RuntimeError("Pyro name server not found")
-
-
- def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf=None):
- """Register an object with the Pyro name server.
-
- Start the name server if not running yet and block until the daemon is terminated.
- The object is registered under `name`, or `name`+ some random suffix if `random_suffix` is set.
-
- """
- if ns_conf is None:
- ns_conf = {}
- if random_suffix:
- name += '.' + hex(random.randint(0, 0xffffff))[2:]
-
- import Pyro4
- with getNS(**ns_conf) as ns:
- with Pyro4.Daemon(ip or get_my_ip(), port or 0) as daemon:
- # register server for remote access
- uri = daemon.register(obj, name)
- ns.remove(name)
- ns.register(name, uri)
- logger.info("%s registered with nameserver (URI '%s')", name, uri)
- daemon.requestLoop()
-
-
- def has_pattern():
- """Check whether the `pattern <https://github.com/clips/pattern>`_ package is installed.
-
- Returns
- -------
- bool
- Is `pattern` installed?
-
- """
- try:
- from pattern.en import parse # noqa:F401
- return True
- except ImportError:
- return False
-
-
- def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
- stopwords=frozenset(), min_length=2, max_length=15):
- """Use the English lemmatizer from `pattern <https://github.com/clips/pattern>`_ to extract UTF8-encoded tokens in
- their base form aka lemma, e.g. "are, is, being" becomes "be" etc.
-
- This is a smarter version of stemming, taking word context into account.
-
- Parameters
- ----------
- content : str
- Input string
- allowed_tags : :class:`_sre.SRE_Pattern`, optional
- Compiled regexp to select POS that will be used.
- Only considers nouns, verbs, adjectives and adverbs by default (all other lemmas are discarded).
- light : bool, optional
- DEPRECATED FLAG, no longer supported by `pattern`.
- stopwords : frozenset, optional
- Set of words that will be removed from output.
- min_length : int, optional
- Minimal token length in output (inclusive).
- max_length : int, optional
- Maximal token length in output (inclusive).
-
- Returns
- -------
- list of str
- List of tokens with their POS tags appended (e.g. 'study/NN').
-
- Warnings
- --------
- This function is only available when the optional `pattern <https://github.com/clips/pattern>`_ is installed.
-
- Raises
- ------
- ImportError
- If `pattern <https://github.com/clips/pattern>`_ not installed.
-
- Examples
- --------
- >>> from gensim.utils import lemmatize
- >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
- ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']
-
- Note the context-dependent part-of-speech tags between these two examples:
-
- >>> lemmatize('The study ranks high.')
- ['study/NN', 'rank/VB', 'high/JJ']
-
- >>> lemmatize('The ranks study hard.')
- ['rank/NN', 'study/VB', 'hard/RB']
-
- """
- if not has_pattern():
- raise ImportError(
- "Pattern library is not installed. Pattern library is needed in order to use lemmatize function"
- )
- from pattern.en import parse
-
- if light:
- import warnings
- warnings.warn("The light flag is no longer supported by pattern.")
-
- # tokenization in `pattern` is weird; it gets thrown off by non-letters,
- # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
- # FIXME this throws away all fancy parsing cues, including sentence structure,
- # abbreviations etc.
- content = u(' ').join(tokenize(content, lower=True, errors='ignore'))
-
- parsed = parse(content, lemmata=True, collapse=False)
- result = []
- for sentence in parsed:
- for token, tag, _, _, lemma in sentence:
- if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
- if allowed_tags.match(tag):
- lemma += "/" + tag[:2]
- result.append(lemma.encode('utf8'))
- return result
-
-
- def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):
- """Create a random gensim BoW vector, with the feature counts following the Poisson distribution.
-
- Parameters
- ----------
- dim : int, optional
- Dimension of vector.
- prob_nnz : float, optional
- Probability that each coordinate will be nonzero; nonzero values are drawn from the Poisson distribution.
- lam : float, optional
- Lambda parameter for the Poisson distribution.
-
- Returns
- -------
- list of (int, float)
- Vector in BoW format.
-
- """
- nnz = np.random.uniform(size=(dim,))
- return [(i, float(np.random.poisson(lam=lam) + 1.0)) for i in xrange(dim) if nnz[i] < prob_nnz]
-
-
- def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0):
- """Create a random Gensim-style corpus (BoW), using :func:`~gensim.utils.mock_data_row`.
-
- Parameters
- ----------
- n_items : int
- Size of corpus
- dim : int
- Dimension of vector, used for :func:`~gensim.utils.mock_data_row`.
- prob_nnz : float, optional
- Probability that each coordinate will be nonzero; nonzero values are drawn from the Poisson distribution,
- used for :func:`~gensim.utils.mock_data_row`.
- lam : float, optional
- Parameter for Poisson distribution, used for :func:`~gensim.utils.mock_data_row`.
-
- Returns
- -------
- list of list of (int, float)
- Gensim-style corpus.
-
- """
- return [mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam) for _ in xrange(n_items)]
-
-
- def prune_vocab(vocab, min_reduce, trim_rule=None):
- """Remove all entries from the `vocab` dictionary with count smaller than `min_reduce`.
-
- Modifies `vocab` in place, returns the sum of all counts that were pruned.
-
- Parameters
- ----------
- vocab : dict
- Input dictionary.
- min_reduce : int
- Frequency threshold for tokens in `vocab`.
- trim_rule : function, optional
- Function for trimming entities from vocab, default behaviour is `vocab[w] <= min_reduce`.
-
- Returns
- -------
- result : int
- Sum of all counts that were pruned.
-
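- Examples
- --------
- For example, pruning tokens with count below 2 removes 'a' (whose count, 1, is returned):
-
- >>> from gensim.utils import prune_vocab
- >>> vocab = {'a': 1, 'b': 2, 'c': 5}
- >>> prune_vocab(vocab, min_reduce=2)
- 1
- >>> sorted(vocab)
- ['b', 'c']
-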
- """
- result = 0
- old_len = len(vocab)
- for w in list(vocab): # make a copy of dict's keys
- if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule): # vocab[w] <= min_reduce:
- result += vocab[w]
- del vocab[w]
- logger.info(
- "pruned out %i tokens with count <=%i (before %i, after %i)",
- old_len - len(vocab), min_reduce, old_len, len(vocab)
- )
- return result
-
-
- def trim_vocab_by_freq(vocab, topk, trim_rule=None):
- """Retain `topk` most frequent words in `vocab`.
- If more words share the same frequency as the `topk`-th one, they are kept as well.
- Modifies `vocab` in place, returns nothing.
-
- Parameters
- ----------
- vocab : dict
- Input dictionary.
- topk : int
- Number of words with highest frequencies to keep.
- trim_rule : function, optional
- Function for trimming entities from vocab, default behaviour is `vocab[w] <= min_count`.
-
- """
- if topk >= len(vocab):
- return
-
- min_count = heapq.nlargest(topk, itervalues(vocab))[-1]
- prune_vocab(vocab, min_count, trim_rule=trim_rule)
-
-
- def merge_counts(dict1, dict2):
- """Merge `dict1` of (word, freq1) and `dict2` of (word, freq2) into `dict1` of (word, freq1+freq2).
- Parameters
- ----------
- dict1 : dict of (str, int)
- First dictionary.
- dict2 : dict of (str, int)
- Second dictionary.
- Returns
- -------
- result : dict
- Merged dictionary with sum of frequencies as values.
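-
- Examples
- --------
- For example, frequencies of shared words are summed:
-
- >>> from gensim.utils import merge_counts
- >>> merge_counts({'cat': 2, 'dog': 1}, {'dog': 3, 'fish': 1}) == {'cat': 2, 'dog': 4, 'fish': 1}
- True
-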
- """
- for word, freq in iteritems(dict2):
- if word in dict1:
- dict1[word] += freq
- else:
- dict1[word] = freq
-
- return dict1
-
-
- def qsize(queue):
- """Get the (approximate) queue size where available.
-
- Parameters
- ----------
- queue : :class:`queue.Queue`
- Input queue.
-
- Returns
- -------
- int
- Queue size, -1 if `qsize` method isn't implemented (OS X).
-
- """
- try:
- return queue.qsize()
- except NotImplementedError:
- # OS X doesn't support qsize
- return -1
-
-
- RULE_DEFAULT = 0
- RULE_DISCARD = 1
- RULE_KEEP = 2
-
-
- def keep_vocab_item(word, count, min_count, trim_rule=None):
- """Should we keep `word` in the vocab or remove it?
-
- Parameters
- ----------
- word : str
- Input word.
- count : int
- Number of times that word appeared in a corpus.
- min_count : int
- Discard words with frequency smaller than this.
- trim_rule : function, optional
- Custom function to decide whether to keep or discard this word.
- If a custom `trim_rule` is not specified, the default behaviour is simply `count >= min_count`.
-
- Returns
- -------
- bool
- True if `word` should stay, False otherwise.
-
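- Examples
- --------
- A custom `trim_rule` can override the default frequency threshold (a minimal sketch):
-
- >>> from gensim.utils import keep_vocab_item, RULE_KEEP, RULE_DEFAULT
- >>> keep_vocab_item('rare_word', count=1, min_count=5)
- False
- >>> always_keep_rare = lambda word, count, min_count: RULE_KEEP if word == 'rare_word' else RULE_DEFAULT
- >>> keep_vocab_item('rare_word', count=1, min_count=5, trim_rule=always_keep_rare)
- True
-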
- """
- default_res = count >= min_count
-
- if trim_rule is None:
- return default_res
- else:
- rule_res = trim_rule(word, count, min_count)
- if rule_res == RULE_KEEP:
- return True
- elif rule_res == RULE_DISCARD:
- return False
- else:
- return default_res
-
-
- def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs):
- r"""Run OS command with the given arguments and return its output as a byte string.
-
- Backported from Python 2.7 with a few minor modifications. Widely used for :mod:`gensim.models.wrappers`.
- Behaves very similarly to https://docs.python.org/2/library/subprocess.html#subprocess.check_output.
-
- Examples
- --------
- >>> from gensim.utils import check_output
- >>> check_output(args=['echo', '1'])
- '1\n'
-
- Raises
- ------
- KeyboardInterrupt
- If Ctrl+C pressed.
-
- """
- try:
- logger.debug("COMMAND: %s %s", popenargs, kwargs)
- process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs)
- output, unused_err = process.communicate()
- retcode = process.poll()
- if retcode:
- cmd = kwargs.get("args")
- if cmd is None:
- cmd = popenargs[0]
- error = subprocess.CalledProcessError(retcode, cmd)
- error.output = output
- raise error
- return output
- except KeyboardInterrupt:
- process.terminate()
- raise
-
-
- def sample_dict(d, n=10, use_random=True):
- """Selected `n` (possibly random) items from the dictionary `d`.
-
- Parameters
- ----------
- d : dict
- Input dictionary.
- n : int, optional
- Number of items to select.
- use_random : bool, optional
- Select items randomly (without replacement), instead of by the natural dict iteration order?
-
- Returns
- -------
- list of (object, object)
- Selected items from dictionary, as a list.
-
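- Examples
- --------
- The selected keys are random by default, so only the number of returned items is deterministic:
-
- >>> from gensim.utils import sample_dict
- >>> len(sample_dict({'a': 1, 'b': 2, 'c': 3}, n=2))
- 2
-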
- """
- selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n)
- return [(key, d[key]) for key in selected_keys]
-
-
- def strided_windows(ndarray, window_size):
- """Produce a numpy.ndarray of windows, as from a sliding window.
-
- Parameters
- ----------
- ndarray : numpy.ndarray
- Input array
- window_size : int
- Sliding window size.
-
- Returns
- -------
- numpy.ndarray
- Subsequences produced by sliding a window of the given size over the `ndarray`.
- Since this uses striding, the individual arrays are views rather than copies of `ndarray`.
- Changes to one view modify the others and the original.
-
- Examples
- --------
- >>> from gensim.utils import strided_windows
- >>> strided_windows(np.arange(5), 2)
- array([[0, 1],
- [1, 2],
- [2, 3],
- [3, 4]])
- >>> strided_windows(np.arange(10), 5)
- array([[0, 1, 2, 3, 4],
- [1, 2, 3, 4, 5],
- [2, 3, 4, 5, 6],
- [3, 4, 5, 6, 7],
- [4, 5, 6, 7, 8],
- [5, 6, 7, 8, 9]])
-
- """
- ndarray = np.asarray(ndarray)
- if window_size == ndarray.shape[0]:
- return np.array([ndarray])
- elif window_size > ndarray.shape[0]:
- return np.ndarray((0, 0))
-
- stride = ndarray.strides[0]
- return np.lib.stride_tricks.as_strided(
- ndarray, shape=(ndarray.shape[0] - window_size + 1, window_size),
- strides=(stride, stride))
-
-
- def iter_windows(texts, window_size, copy=False, ignore_below_size=True, include_doc_num=False):
- """Produce a generator over the given texts using a sliding window of `window_size`.
-
- The windows produced are views of some subsequence of a text.
- To use deep copies instead, pass `copy=True`.
-
- Parameters
- ----------
- texts : list of str
- List of string sentences.
- window_size : int
- Size of sliding window.
- copy : bool, optional
- Produce deep copies.
- ignore_below_size : bool, optional
- Ignore documents that are not at least `window_size` in length?
- include_doc_num : bool, optional
- Yield the text position with `texts` along with each window?
-
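- Examples
- --------
- For example, sliding a window of size 2 over a single tokenized document:
-
- >>> from gensim.utils import iter_windows
- >>> [list(window) for window in iter_windows([['a', 'b', 'c', 'd']], 2)]
- [['a', 'b'], ['b', 'c'], ['c', 'd']]
-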
- """
- for doc_num, document in enumerate(texts):
- for window in _iter_windows(document, window_size, copy, ignore_below_size):
- if include_doc_num:
- yield (doc_num, window)
- else:
- yield window
-
-
- def _iter_windows(document, window_size, copy=False, ignore_below_size=True):
- doc_windows = strided_windows(document, window_size)
- if doc_windows.shape[0] == 0:
- if not ignore_below_size:
- yield document.copy() if copy else document
- else:
- for doc_window in doc_windows:
- yield doc_window.copy() if copy else doc_window
-
-
- def flatten(nested_list):
- """Recursively flatten a nested sequence of elements.
-
- Parameters
- ----------
- nested_list : iterable
- Possibly nested sequence of elements to flatten.
-
- Returns
- -------
- list
- Flattened version of `nested_list` where any elements that are an iterable (`collections.Iterable`)
- have been unpacked into the top-level list, in a recursive fashion.
-
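- Examples
- --------
- Strings are kept whole (they are iterable, but are not unpacked):
-
- >>> from gensim.utils import flatten
- >>> flatten([[1, 2], [3, [4, 5]], 'abc'])
- [1, 2, 3, 4, 5, 'abc']
-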
- """
- return list(lazy_flatten(nested_list))
-
-
- def lazy_flatten(nested_list):
- """Lazy version of :func:`~gensim.utils.flatten`.
-
- Parameters
- ----------
- nested_list : list
- Possibly nested list.
-
- Yields
- ------
- object
- Element of list
-
- """
- for el in nested_list:
- if isinstance(el, collections.Iterable) and not isinstance(el, string_types):
- for sub in flatten(el):
- yield sub
- else:
- yield el
-
-
- def save_as_line_sentence(corpus, filename):
- """Save the corpus in LineSentence format, i.e. each sentence on a separate line,
- with tokens separated by spaces.
-
- Parameters
- ----------
- corpus : iterable of iterables of strings
- Stream of tokenized sentences to write out.
- filename : str
- Path to the output file.
-
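- Examples
- --------
- A sketch (the output path is an illustrative placeholder):
-
- >>> from gensim.utils import save_as_line_sentence
- >>> save_as_line_sentence([['first', 'sentence'], ['second', 'sentence']], '/tmp/sentences.txt')  # doctest: +SKIP
-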
- """
- with smart_open(filename, mode='wb', encoding='utf8') as fout:
- for sentence in corpus:
- line = any2unicode(' '.join(sentence) + '\n')
- fout.write(line)
-
-
- def effective_n_jobs(n_jobs):
- """Determines the number of jobs can run in parallel.
-
- Just like in sklearn, passing n_jobs=-1 means using all available
- CPU cores.
-
- Parameters
- ----------
- n_jobs : int
- Number of workers requested by caller.
-
- Returns
- -------
- int
- Number of effective jobs.
-
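- Examples
- --------
- For example, `None` means a single job and `-1` means one job per available core:
-
- >>> from multiprocessing import cpu_count
- >>> from gensim.utils import effective_n_jobs
- >>> effective_n_jobs(None)
- 1
- >>> effective_n_jobs(-1) == cpu_count()
- True
-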
- """
- if n_jobs == 0:
- raise ValueError('n_jobs == 0 in Parallel has no meaning')
- elif n_jobs is None:
- return 1
- elif n_jobs < 0:
- n_jobs = max(cpu_count() + 1 + n_jobs, 1)
- return n_jobs