#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Various general utility functions."""

from __future__ import with_statement

from contextlib import contextmanager
import collections
import logging
import warnings

try:
    from html.entities import name2codepoint as n2cp
except ImportError:
    from htmlentitydefs import name2codepoint as n2cp
try:
    import cPickle as _pickle
except ImportError:
    import pickle as _pickle

import re
import unicodedata
import os
import random
import itertools
import tempfile
from functools import wraps
import multiprocessing
import shutil
import sys
import subprocess
import inspect
import heapq

import numpy as np
import numbers
import scipy.sparse

from six import iterkeys, iteritems, itervalues, u, string_types, unichr
from six.moves import xrange
from smart_open import smart_open

from multiprocessing import cpu_count

if sys.version_info[0] >= 3:
    unicode = str

logger = logging.getLogger(__name__)

PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE)

RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)


def get_random_state(seed):
    """Generate :class:`numpy.random.RandomState` based on input seed.

    Parameters
    ----------
    seed : {None, int, array_like}
        Seed for random state.

    Returns
    -------
    :class:`numpy.random.RandomState`
        Random state.

    Raises
    ------
    ValueError
        If `seed` is not one of {None, int, array_like}.

    Notes
    -----
    Method originally from `maciejkula/glove-python <https://github.com/maciejkula/glove-python>`_
    and written by `@joshloyal <https://github.com/joshloyal>`_.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, (numbers.Integral, np.integer)):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError('%r cannot be used to seed a np.random.RandomState instance' % seed)
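
# A minimal usage sketch (added here for illustration, not part of the original module):
# `get_random_state` accepts None, an int seed, or an existing RandomState.
#
#   >>> rs = get_random_state(42)    # deterministic RandomState built from an int seed
#   >>> rs is get_random_state(rs)   # an existing RandomState is passed through unchanged
#   True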


def synchronous(tlockname):
    """A decorator to place an instance-based lock around a method.

    Notes
    -----
    Adapted from http://code.activestate.com/recipes/577105-synchronization-decorator-for-class-methods/.
    """
    def _synched(func):
        @wraps(func)
        def _synchronizer(self, *args, **kwargs):
            tlock = getattr(self, tlockname)
            logger.debug("acquiring lock %r for %s", tlockname, func.__name__)

            with tlock:  # use lock as a context manager to perform safe acquire/release pairs
                logger.debug("acquired lock %r for %s", tlockname, func.__name__)
                result = func(self, *args, **kwargs)
                logger.debug("releasing lock %r for %s", tlockname, func.__name__)
                return result
        return _synchronizer
    return _synched


def file_or_filename(input):
    """Open a filename for reading with `smart_open`, or seek to the beginning if `input` is an already open file.

    Parameters
    ----------
    input : str or file-like
        Filename or file-like object.

    Returns
    -------
    file-like object
        An open file, positioned at the beginning.
    """
    if isinstance(input, string_types):
        # input was a filename: open as file
        return smart_open(input)
    else:
        # input already a file-like object; just reset to the beginning
        input.seek(0)
        return input


@contextmanager
def open_file(input):
    """Provide "with-like" behaviour without closing the file object.

    Parameters
    ----------
    input : str or file-like
        Filename or file-like object.

    Yields
    ------
    file
        File-like object based on `input` (or `input` itself, if it is already file-like).
    """
    mgr = file_or_filename(input)
    exc = False
    try:
        yield mgr
    except Exception:
        # Handling any unhandled exceptions from the code nested in 'with' statement.
        exc = True
        if not isinstance(input, string_types) or not mgr.__exit__(*sys.exc_info()):
            raise
        # Try to introspect and silence errors.
    finally:
        if not exc and isinstance(input, string_types):
            mgr.__exit__(None, None, None)
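
# Hedged usage sketch (illustrative only; assumes a local file "corpus.txt" exists):
# `open_file` yields a readable handle whether given a path or an already open file
# object, and only closes handles that it opened itself.
#
#   >>> with open_file('corpus.txt') as fin:          # path: opened via smart_open, closed on exit
#   ...     first_line = fin.readline()
#   >>> with open('corpus.txt', 'rb') as existing:
#   ...     with open_file(existing) as fin:          # file object: rewound to 0, left open on exit
#   ...         first_line = fin.readline()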


def deaccent(text):
    """Remove letter accents from the given string.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        Unicode string without accents.

    Examples
    --------
    >>> from gensim.utils import deaccent
    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'
    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    norm = unicodedata.normalize("NFD", text)
    result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result)


def copytree_hardlink(source, dest):
    """Recursively copy a directory a la shutil.copytree, but hardlink files instead of copying them.

    Parameters
    ----------
    source : str
        Path to source directory.
    dest : str
        Path to destination directory.

    Warnings
    --------
    Available on UNIX systems only.
    """
    copy2 = shutil.copy2
    try:
        shutil.copy2 = os.link
        shutil.copytree(source, dest)
    finally:
        shutil.copy2 = copy2


def tokenize(text, lowercase=False, deacc=False, encoding='utf8', errors="strict", to_lower=False, lower=False):
    """Iteratively yield tokens as unicode strings, optionally lowercasing them and removing accent marks.

    Parameters
    ----------
    text : str or bytes
        Input string.
    deacc : bool, optional
        Remove accentuation using :func:`~gensim.utils.deaccent`?
    encoding : str, optional
        Encoding of input string, used as parameter for :func:`~gensim.utils.to_unicode`.
    errors : str, optional
        Error handling behaviour, used as parameter for :func:`~gensim.utils.to_unicode`.
    lowercase : bool, optional
        Lowercase the input string?
    to_lower : bool, optional
        Same as `lowercase`. Convenience alias.
    lower : bool, optional
        Same as `lowercase`. Convenience alias.

    Yields
    ------
    str
        Contiguous sequences of alphabetic characters (no digits!), using :func:`~gensim.utils.simple_tokenize`.

    Examples
    --------
    >>> from gensim.utils import tokenize
    >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc=True))
    [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu']
    """
    lowercase = lowercase or to_lower or lower
    text = to_unicode(text, encoding, errors=errors)
    if lowercase:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    return simple_tokenize(text)


def simple_tokenize(text):
    """Tokenize input text using :const:`gensim.utils.PAT_ALPHABETIC`.

    Parameters
    ----------
    text : str
        Input text.

    Yields
    ------
    str
        Tokens from `text`.
    """
    for match in PAT_ALPHABETIC.finditer(text):
        yield match.group()


def simple_preprocess(doc, deacc=False, min_len=2, max_len=15):
    """Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.

    Uses :func:`~gensim.utils.tokenize` internally.

    Parameters
    ----------
    doc : str
        Input document.
    deacc : bool, optional
        Remove accent marks from tokens using :func:`~gensim.utils.deaccent`?
    min_len : int, optional
        Minimum length of token (inclusive). Shorter tokens are discarded.
    max_len : int, optional
        Maximum length of token in result (inclusive). Longer tokens are discarded.

    Returns
    -------
    list of str
        Tokens extracted from `doc`.
    """
    tokens = [
        token for token in tokenize(doc, lower=True, deacc=deacc, errors='ignore')
        if min_len <= len(token) <= max_len and not token.startswith('_')
    ]
    return tokens


def any2utf8(text, errors='strict', encoding='utf8'):
    """Convert a unicode or bytes string in the given encoding into a utf8 bytestring.

    Parameters
    ----------
    text : str
        Input text.
    errors : str, optional
        Error handling behaviour if `text` is a bytestring.
    encoding : str, optional
        Encoding of `text` if it is a bytestring.

    Returns
    -------
    str
        Bytestring in utf8.
    """
    if isinstance(text, unicode):
        return text.encode('utf8')
    # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8
    return unicode(text, encoding, errors=errors).encode('utf8')


to_utf8 = any2utf8


def any2unicode(text, encoding='utf8', errors='strict'):
    """Convert `text` (bytestring in given encoding or unicode) to unicode.

    Parameters
    ----------
    text : str
        Input text.
    errors : str, optional
        Error handling behaviour if `text` is a bytestring.
    encoding : str, optional
        Encoding of `text` if it is a bytestring.

    Returns
    -------
    str
        Unicode version of `text`.
    """
    if isinstance(text, unicode):
        return text
    return unicode(text, encoding, errors=errors)


to_unicode = any2unicode
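
# Minimal round-trip sketch for the encoding helpers above (illustrative; Python 3 reprs shown):
#
#   >>> to_utf8(u'Σ gensim')            # unicode in -> utf8-encoded bytes out
#   b'\xce\xa3 gensim'
#   >>> to_unicode(b'\xce\xa3 gensim')  # utf8 bytes in -> unicode out
#   'Σ gensim'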


def call_on_class_only(*args, **kwargs):
    """Helper to raise `AttributeError` if a class method is called on an instance. Used internally.

    Parameters
    ----------
    *args
        Variable length argument list.
    **kwargs
        Arbitrary keyword arguments.

    Raises
    ------
    AttributeError
        If a class method is called on an instance.
    """
    raise AttributeError('This method should be called on a class object.')


class SaveLoad(object):
    """Serialize/deserialize objects from disk, by equipping them with the `save()` / `load()` methods.

    Warnings
    --------
    This uses pickle internally (among other techniques), so objects must not contain unpicklable attributes
    such as lambda functions etc.
    """
    @classmethod
    def load(cls, fname, mmap=None):
        """Load an object previously saved using :meth:`~gensim.utils.SaveLoad.save` from a file.

        Parameters
        ----------
        fname : str
            Path to file that contains the needed object.
        mmap : str, optional
            Memory-map option. If the object was saved with large arrays stored separately, you can load these
            arrays via mmap (shared memory) using `mmap='r'`.
            If the file being loaded is compressed (either '.gz' or '.bz2'), then `mmap=None` **must be** set.

        See Also
        --------
        :meth:`~gensim.utils.SaveLoad.save`
            Save object to file.

        Returns
        -------
        object
            Object loaded from `fname`.

        Raises
        ------
        AttributeError
            When called on an object instance instead of a class (this is a class method).
        """
        logger.info("loading %s object from %s", cls.__name__, fname)

        compress, subname = SaveLoad._adapt_by_suffix(fname)

        obj = unpickle(fname)
        obj._load_specials(fname, mmap, compress, subname)
        logger.info("loaded %s", fname)
        return obj

    def _load_specials(self, fname, mmap, compress, subname):
        """Load attributes that were stored separately, and give them the same opportunity
        to recursively load using the :class:`~gensim.utils.SaveLoad` interface.

        Parameters
        ----------
        fname : str
            Input file path.
        mmap : {None, 'r+', 'r', 'w+', 'c'}
            Memory-map options. See `numpy.load(mmap_mode)
            <https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.load.html>`_.
        compress : bool
            Is the input file compressed?
        subname : str
            Attribute name. Set automatically during recursive processing.
        """
        def mmap_error(obj, filename):
            return IOError(
                'Cannot mmap compressed object %s in file %s. ' % (obj, filename) +
                'Use `load(fname, mmap=None)` or uncompress files manually.'
            )

        for attrib in getattr(self, '__recursive_saveloads', []):
            cfname = '.'.join((fname, attrib))
            logger.info("loading %s recursively from %s.* with mmap=%s", attrib, cfname, mmap)
            getattr(self, attrib)._load_specials(cfname, mmap, compress, subname)

        for attrib in getattr(self, '__numpys', []):
            logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)

            if compress:
                if mmap:
                    raise mmap_error(attrib, subname(fname, attrib))

                val = np.load(subname(fname, attrib))['val']
            else:
                val = np.load(subname(fname, attrib), mmap_mode=mmap)

            setattr(self, attrib, val)

        for attrib in getattr(self, '__scipys', []):
            logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap)
            sparse = unpickle(subname(fname, attrib))

            if compress:
                if mmap:
                    raise mmap_error(attrib, subname(fname, attrib))

                with np.load(subname(fname, attrib, 'sparse')) as f:
                    sparse.data = f['data']
                    sparse.indptr = f['indptr']
                    sparse.indices = f['indices']
            else:
                sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap)
                sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap)
                sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap)

            setattr(self, attrib, sparse)

        for attrib in getattr(self, '__ignoreds', []):
            logger.info("setting ignored attribute %s to None", attrib)
            setattr(self, attrib, None)

    @staticmethod
    def _adapt_by_suffix(fname):
        """Get compress setting and filename builder for numpy file compression.

        Parameters
        ----------
        fname : str
            Input filename.

        Returns
        -------
        (bool, function)
            First element is True if `fname` is compressed, second is a function that builds the sub-filenames.
        """
        compress, suffix = (True, 'npz') if fname.endswith('.gz') or fname.endswith('.bz2') else (False, 'npy')
        return compress, lambda *args: '.'.join(args + (suffix,))

    def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
        """Save the object to a file. Used internally by :meth:`gensim.utils.SaveLoad.save()`.

        Parameters
        ----------
        fname : str
            Path to file.
        separately : list, optional
            Iterable of attributes that need to be stored separately.
        sep_limit : int, optional
            Limit for separation.
        ignore : frozenset, optional
            Attributes that shouldn't be stored.
        pickle_protocol : int, optional
            Protocol number for pickle.

        Notes
        -----
        If `separately` is None, automatically detect large numpy/scipy.sparse arrays in the object being stored,
        and store them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays back
        on load efficiently.

        You can also set `separately` manually, in which case it must be a list of attribute names to be stored
        in separate files. The automatic check is not performed in this case.
        """
        logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately)

        compress, subname = SaveLoad._adapt_by_suffix(fname)

        restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol,
                                       compress, subname)
        try:
            pickle(self, fname, protocol=pickle_protocol)
        finally:
            # restore attribs handled specially
            for obj, asides in restores:
                for attrib, val in iteritems(asides):
                    setattr(obj, attrib, val)
        logger.info("saved %s", fname)

    def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
        """Save aside any attributes that need to be handled separately, including
        by recursion any attributes that are themselves :class:`~gensim.utils.SaveLoad` instances.

        Parameters
        ----------
        fname : str
            Output filename.
        separately : list or None
            List of attributes to store separately.
        sep_limit : int
            Don't store arrays smaller than this separately. In bytes.
        ignore : iterable of str
            Attributes that shouldn't be stored at all.
        pickle_protocol : int
            Protocol number for pickle.
        compress : bool
            If True - compress output with :func:`numpy.savez_compressed`.
        subname : function
            Produced by :meth:`~gensim.utils.SaveLoad._adapt_by_suffix`.

        Returns
        -------
        list of (obj, {attrib: value, ...})
            Settings that the caller should use to restore each object's attributes that were set aside
            during the default :func:`~gensim.utils.pickle`.
        """
        asides = {}
        sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)
        if separately is None:
            separately = []
            for attrib, val in iteritems(self.__dict__):
                if isinstance(val, np.ndarray) and val.size >= sep_limit:
                    separately.append(attrib)
                elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit:
                    separately.append(attrib)

        # whatever's in `separately` or `ignore` at this point won't get pickled
        for attrib in separately + list(ignore):
            if hasattr(self, attrib):
                asides[attrib] = getattr(self, attrib)
                delattr(self, attrib)

        recursive_saveloads = []
        restores = []
        for attrib, val in iteritems(self.__dict__):
            if hasattr(val, '_save_specials'):  # better than 'isinstance(val, SaveLoad)' if IPython reloading
                recursive_saveloads.append(attrib)
                cfname = '.'.join((fname, attrib))
                restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname))

        try:
            numpys, scipys, ignoreds = [], [], []
            for attrib, val in iteritems(asides):
                if isinstance(val, np.ndarray) and attrib not in ignore:
                    numpys.append(attrib)
                    logger.info("storing np array '%s' to %s", attrib, subname(fname, attrib))

                    if compress:
                        np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val))
                    else:
                        np.save(subname(fname, attrib), np.ascontiguousarray(val))

                elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore:
                    scipys.append(attrib)
                    logger.info("storing scipy.sparse array '%s' under %s", attrib, subname(fname, attrib))

                    if compress:
                        np.savez_compressed(
                            subname(fname, attrib, 'sparse'),
                            data=val.data,
                            indptr=val.indptr,
                            indices=val.indices
                        )
                    else:
                        np.save(subname(fname, attrib, 'data'), val.data)
                        np.save(subname(fname, attrib, 'indptr'), val.indptr)
                        np.save(subname(fname, attrib, 'indices'), val.indices)

                    data, indptr, indices = val.data, val.indptr, val.indices
                    val.data, val.indptr, val.indices = None, None, None

                    try:
                        # store array-less object
                        pickle(val, subname(fname, attrib), protocol=pickle_protocol)
                    finally:
                        val.data, val.indptr, val.indices = data, indptr, indices
                else:
                    logger.info("not storing attribute %s", attrib)
                    ignoreds.append(attrib)

            self.__dict__['__numpys'] = numpys
            self.__dict__['__scipys'] = scipys
            self.__dict__['__ignoreds'] = ignoreds
            self.__dict__['__recursive_saveloads'] = recursive_saveloads
        except Exception:
            # restore the attributes if exception-interrupted
            for attrib, val in iteritems(asides):
                setattr(self, attrib, val)
            raise
        return restores + [(self, asides)]

    def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2):
        """Save the object to a file.

        Parameters
        ----------
        fname_or_handle : str or file-like
            Path to the output file or an already opened file-like object. If the object is a file handle,
            no special array handling will be performed, all attributes will be saved to the same file.
        separately : list of str or None, optional
            If None, automatically detect large numpy/scipy.sparse arrays in the object being stored, and store
            them into separate files. This prevents memory errors for large objects, and also allows
            `memory-mapping <https://en.wikipedia.org/wiki/Mmap>`_ the large arrays for efficient
            loading and sharing the large arrays in RAM between multiple processes.

            If list of str: store these attributes into separate files. The automated size check
            is not performed in this case.
        sep_limit : int, optional
            Don't store arrays smaller than this separately. In bytes.
        ignore : frozenset of str, optional
            Attributes that shouldn't be stored at all.
        pickle_protocol : int, optional
            Protocol number for pickle.

        See Also
        --------
        :meth:`~gensim.utils.SaveLoad.load`
            Load object from file.
        """
        try:
            _pickle.dump(self, fname_or_handle, protocol=pickle_protocol)
            logger.info("saved %s object", self.__class__.__name__)
        except TypeError:  # `fname_or_handle` does not have write attribute
            self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol)
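
# Hedged save/load round-trip sketch (illustrative; `MyModel` is a hypothetical SaveLoad
# subclass with a large numpy attribute, not something defined in this module):
#
#   >>> model.save('/tmp/my_model')                        # big arrays go to separate .npy files
#   >>> loaded = MyModel.load('/tmp/my_model', mmap='r')   # memory-map those arrays on load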


def identity(p):
    """Identity function, for flows that don't accept lambdas (pickling etc).

    Parameters
    ----------
    p : object
        Input parameter.

    Returns
    -------
    object
        Same as `p`.
    """
    return p


def get_max_id(corpus):
    """Get the highest feature id that appears in the corpus.

    Parameters
    ----------
    corpus : iterable of iterable of (int, numeric)
        Collection of texts in BoW format.

    Returns
    -------
    int
        Highest feature id.

    Notes
    -----
    Returns -1 for an empty `corpus`.
    """
    maxid = -1
    for document in corpus:
        maxid = max(maxid, max([-1] + [fieldid for fieldid, _ in document]))  # [-1] to avoid exceptions from max(empty)
    return maxid


class FakeDict(object):
    """Objects of this class act as dictionaries that map integer->str(integer), for a specified
    range of integers <0, num_terms).

    This is meant to avoid allocating real dictionaries when `num_terms` is huge, which is a waste of memory.
    """
    def __init__(self, num_terms):
        """

        Parameters
        ----------
        num_terms : int
            Number of terms.
        """
        self.num_terms = num_terms

    def __str__(self):
        return "FakeDict(num_terms=%s)" % self.num_terms

    def __getitem__(self, val):
        if 0 <= val < self.num_terms:
            return str(val)
        raise ValueError("internal id out of bounds (%s, expected <0..%s))" % (val, self.num_terms))

    def iteritems(self):
        """Iterate over all keys and values.

        Yields
        ------
        (int, str)
            Pair of (id, token).
        """
        for i in xrange(self.num_terms):
            yield i, str(i)

    def keys(self):
        """Override `dict.keys()`, which is used to determine the maximum internal id of a corpus,
        i.e. the vocabulary dimensionality.

        Returns
        -------
        list of int
            Highest id, packed in a list.

        Notes
        -----
        To avoid materializing the whole `range(0, self.num_terms)`,
        this returns the highest id = `[self.num_terms - 1]` only.
        """
        return [self.num_terms - 1]

    def __len__(self):
        return self.num_terms

    def get(self, val, default=None):
        if 0 <= val < self.num_terms:
            return str(val)
        return default
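
# Quick illustration of the FakeDict contract (added sketch, not in the original file):
#
#   >>> d = FakeDict(3)
#   >>> d[2], d.get(5, 'missing'), len(d)
#   ('2', 'missing', 3)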


def dict_from_corpus(corpus):
    """Scan corpus for all word ids that appear in it, then construct a mapping
    which maps each `word_id` -> `str(word_id)`.

    Parameters
    ----------
    corpus : iterable of iterable of (int, numeric)
        Collection of texts in BoW format.

    Returns
    -------
    id2word : :class:`~gensim.utils.FakeDict`
        "Fake" mapping which maps each `word_id` -> `str(word_id)`.

    Warnings
    --------
    This function is used whenever *words* need to be displayed (as opposed to just their ids)
    but no `word_id` -> `word` mapping was provided. The resulting mapping only covers words actually
    used in the corpus, up to the highest `word_id` found.
    """
    num_terms = 1 + get_max_id(corpus)
    id2word = FakeDict(num_terms)
    return id2word


def is_corpus(obj):
    """Check whether `obj` is a corpus, by peeking at its first element. Works even on streamed generators.
    The peeked element is put back into an object returned by this function, so always use
    that returned object instead of the original `obj`.

    Parameters
    ----------
    obj : object
        An `iterable of iterable` that contains (int, numeric).

    Returns
    -------
    (bool, object)
        Pair of (is `obj` a corpus, `obj` with peeked element restored).

    Examples
    --------
    >>> from gensim.utils import is_corpus
    >>> corpus = [[(1, 1.0)], [(2, -0.3), (3, 0.12)]]
    >>> corpus_or_not, corpus = is_corpus(corpus)

    Warnings
    --------
    An "empty" corpus (empty input sequence) is ambiguous, so in this case
    the result is forcefully defined as (False, `obj`).
    """
    try:
        if 'Corpus' in obj.__class__.__name__:  # the most common case, quick hack
            return True, obj
    except Exception:
        pass
    try:
        if hasattr(obj, 'next') or hasattr(obj, '__next__'):
            # the input is an iterator object, meaning once we call next()
            # that element could be gone forever. we must be careful to put
            # whatever we retrieve back again
            doc1 = next(obj)
            obj = itertools.chain([doc1], obj)
        else:
            doc1 = next(iter(obj))  # empty corpus is resolved to False here
        if len(doc1) == 0:  # sparse documents must have a __len__ function (list, tuple...)
            return True, obj  # the first document is empty=>assume this is a corpus
        # if obj is a 1D numpy array(scalars) instead of 2-tuples, it resolves to False here
        id1, val1 = next(iter(doc1))
        id1, val1 = int(id1), float(val1)  # must be a 2-tuple (integer, float)
    except Exception:
        return False, obj
    return True, obj


def get_my_ip():
    """Try to obtain our external ip (from the Pyro4 nameserver's point of view).

    Returns
    -------
    str
        IP address.

    Warnings
    --------
    This tries to sidestep the issue of bogus `/etc/hosts` entries and other local misconfiguration,
    which often mess up hostname resolution.
    If all else fails, fall back to simple `socket.gethostbyname()` lookup.
    """
    import socket
    try:
        from Pyro4.naming import locateNS
        # we know the nameserver must exist, so use it as our anchor point
        ns = locateNS()
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        s.connect((ns._pyroUri.host, ns._pyroUri.port))
        result, port = s.getsockname()
    except Exception:
        try:
            # see what ifconfig says about our default interface
            import commands
            result = commands.getoutput("ifconfig").split("\n")[1].split()[1][5:]
            if len(result.split('.')) != 4:
                raise Exception()
        except Exception:
            # give up, leave the resolution to gethostbyname
            result = socket.gethostbyname(socket.gethostname())
    return result


class RepeatCorpus(SaveLoad):
    """Wrap a `corpus` as another corpus of length `reps`. This is achieved by repeating documents from `corpus`
    over and over again, until the requested length `len(result) == reps` is reached.
    Repetition is done on the fly (efficiently, via `itertools`).

    Examples
    --------
    >>> from gensim.utils import RepeatCorpus
    >>>
    >>> corpus = [[(1, 2)], []]  # 2 documents
    >>> list(RepeatCorpus(corpus, 5))  # repeat 2.5 times to get 5 documents
    [[(1, 2)], [], [(1, 2)], [], [(1, 2)]]
    """
    def __init__(self, corpus, reps):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        reps : int
            Number of repeats for documents from corpus.
        """
        self.corpus = corpus
        self.reps = reps

    def __iter__(self):
        return itertools.islice(itertools.cycle(self.corpus), self.reps)


class RepeatCorpusNTimes(SaveLoad):
    """Wrap a `corpus` and repeat it `n` times.

    Examples
    --------
    >>> from gensim.utils import RepeatCorpusNTimes
    >>>
    >>> corpus = [[(1, 0.5)], []]
    >>> list(RepeatCorpusNTimes(corpus, 3))  # repeat 3 times
    [[(1, 0.5)], [], [(1, 0.5)], [], [(1, 0.5)], []]
    """
    def __init__(self, corpus, n):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        n : int
            Number of repeats for corpus.
        """
        self.corpus = corpus
        self.n = n

    def __iter__(self):
        for _ in xrange(self.n):
            for document in self.corpus:
                yield document


class ClippedCorpus(SaveLoad):
    """Wrap a `corpus` and return at most the first `max_docs` documents from it."""

    def __init__(self, corpus, max_docs=None):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        max_docs : int
            Maximum number of documents in the wrapped corpus.

        Warnings
        --------
        Any documents after `max_docs` are ignored. This effectively limits the length of the returned corpus
        to <= `max_docs`. Set `max_docs=None` for "no limit", effectively wrapping the entire input corpus.
        """
        self.corpus = corpus
        self.max_docs = max_docs

    def __iter__(self):
        return itertools.islice(self.corpus, self.max_docs)

    def __len__(self):
        return min(self.max_docs, len(self.corpus))


class SlicedCorpus(SaveLoad):
    """Wrap `corpus` and return a slice of it."""

    def __init__(self, corpus, slice_):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, numeric)
            Input corpus.
        slice_ : slice or iterable
            Slice for `corpus`.

        Notes
        -----
        Negative slicing can only be used if the corpus is indexable; otherwise, the corpus will be iterated over.
        The slice can also be a np.ndarray to support fancy indexing.

        Calculating the size of a SlicedCorpus is expensive when using a slice, as the corpus has
        to be iterated over once. Using a list or np.ndarray does not have this drawback, but consumes more memory.
        """
        self.corpus = corpus
        self.slice_ = slice_
        self.length = None

    def __iter__(self):
        if hasattr(self.corpus, 'index') and len(self.corpus.index) > 0:
            return (self.corpus.docbyoffset(i) for i in self.corpus.index[self.slice_])
        return itertools.islice(self.corpus, self.slice_.start, self.slice_.stop, self.slice_.step)

    def __len__(self):
        # check cached length, calculate if needed
        if self.length is None:
            if isinstance(self.slice_, (list, np.ndarray)):
                self.length = len(self.slice_)
            elif isinstance(self.slice_, slice):
                (start, end, step) = self.slice_.indices(len(self.corpus.index))
                diff = end - start
                self.length = diff // step + (diff % step > 0)
            else:
                self.length = sum(1 for x in self)
        return self.length


def safe_unichr(intval):
    """Create a unicode character from its integer value. In case `unichr` fails, render the character
    as an escaped `\\U<8-byte hex value of intval>` string.

    Parameters
    ----------
    intval : int
        Integer code of character.

    Returns
    -------
    string
        Unicode string of character.
    """
    try:
        return unichr(intval)
    except ValueError:
        # ValueError: unichr() arg not in range(0x10000) (narrow Python build)
        s = "\\U%08x" % intval
        # return UTF16 surrogate pair
        return s.decode('unicode-escape')


def decode_htmlentities(text):
    """Decode all HTML entities in text that are encoded as hex, decimal or named entities.
    Adapted from `python-twitter-ircbot/html_decode.py
    <http://github.com/sku/python-twitter-ircbot/blob/321d94e0e40d0acc92f5bf57d126b57369da70de/html_decode.py>`_.

    Parameters
    ----------
    text : str
        Input HTML.

    Examples
    --------
    >>> from gensim.utils import decode_htmlentities
    >>>
    >>> u = u'E tu vivrai nel terrore - L&#x27;aldil&#xE0; (1981)'
    >>> print(decode_htmlentities(u).encode('UTF-8'))
    E tu vivrai nel terrore - L'aldilà (1981)
    >>> print(decode_htmlentities("l&#39;eau"))
    l'eau
    >>> print(decode_htmlentities("foo &lt; bar"))
    foo < bar
    """
    def substitute_entity(match):
        try:
            ent = match.group(3)
            if match.group(1) == "#":
                # decoding by number
                if match.group(2) == '':
                    # number is in decimal
                    return safe_unichr(int(ent))
                elif match.group(2) in ['x', 'X']:
                    # number is in hex
                    return safe_unichr(int(ent, 16))
            else:
                # they were using a name
                cp = n2cp.get(ent)
                if cp:
                    return safe_unichr(cp)
                else:
                    return match.group()
        except Exception:
            # in case of errors, return original input
            return match.group()

    return RE_HTML_ENTITY.sub(substitute_entity, text)


def chunkize_serial(iterable, chunksize, as_numpy=False, dtype=np.float32):
    """Yield elements from `iterable` in "chunksize"-ed groups.

    The last returned element may be smaller if the length of collection is not divisible by `chunksize`.

    Parameters
    ----------
    iterable : iterable of object
        An iterable.
    chunksize : int
        Split iterable into chunks of this size.
    as_numpy : bool, optional
        Yield chunks as `np.ndarray` instead of lists.

    Yields
    ------
    list OR np.ndarray
        "chunksize"-ed chunks of elements from `iterable`.

    Examples
    --------
    >>> print(list(grouper(range(10), 3)))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    """
    it = iter(iterable)
    while True:
        if as_numpy:
            # convert each document to a 2d numpy array (~6x faster when transmitting
            # chunk data over the wire, in Pyro)
            wrapped_chunk = [[np.array(doc, dtype=dtype) for doc in itertools.islice(it, int(chunksize))]]
        else:
            wrapped_chunk = [list(itertools.islice(it, int(chunksize)))]

        if not wrapped_chunk[0]:
            break

        # memory opt: wrap the chunk and then pop(), to avoid leaving behind a dangling reference
        yield wrapped_chunk.pop()


grouper = chunkize_serial


class InputQueue(multiprocessing.Process):
    """Populate a queue of input chunks from a streamed corpus.

    Useful for reading and chunking corpora in the background, in a separate process,
    so that workers that use the queue are not starved for input chunks.
    """
    def __init__(self, q, corpus, chunksize, maxsize, as_numpy):
        """

        Parameters
        ----------
        q : multiprocessing.Queue
            Enqueue chunks into this queue.
        corpus : iterable of iterable of (int, numeric)
            Corpus to read and split into "chunksize"-ed groups.
        chunksize : int
            Split `corpus` into chunks of this size.
        as_numpy : bool, optional
            Enqueue chunks as `numpy.ndarray` instead of lists.
        """
        super(InputQueue, self).__init__()
        self.q = q
        self.maxsize = maxsize
        self.corpus = corpus
        self.chunksize = chunksize
        self.as_numpy = as_numpy

    def run(self):
        it = iter(self.corpus)
        while True:
            chunk = itertools.islice(it, self.chunksize)
            if self.as_numpy:
                # HACK XXX convert documents to numpy arrays, to save memory.
                # This also gives a scipy warning at runtime:
                # "UserWarning: indices array has non-integer dtype (float64)"
                wrapped_chunk = [[np.asarray(doc) for doc in chunk]]
            else:
                wrapped_chunk = [list(chunk)]

            if not wrapped_chunk[0]:
                self.q.put(None, block=True)
                break

            try:
                qsize = self.q.qsize()
            except NotImplementedError:
                qsize = '?'
            logger.debug("prepared another chunk of %i documents (qsize=%s)", len(wrapped_chunk[0]), qsize)
            self.q.put(wrapped_chunk.pop(), block=True)


if os.name == 'nt':
    warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

    def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
        """Split `corpus` into fixed-sized chunks, using :func:`~gensim.utils.chunkize_serial`.

        Parameters
        ----------
        corpus : iterable of object
            An iterable.
        chunksize : int
            Split `corpus` into chunks of this size.
        maxsize : int, optional
            Ignored. For interface compatibility only.
        as_numpy : bool, optional
            Yield chunks as `np.ndarray` instead of lists?

        Yields
        ------
        list OR np.ndarray
            "chunksize"-ed chunks of elements from `corpus`.
        """
        for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):
            yield chunk
else:
    def chunkize(corpus, chunksize, maxsize=0, as_numpy=False):
        """Split `corpus` into fixed-sized chunks, using :func:`~gensim.utils.chunkize_serial`.

        Parameters
        ----------
        corpus : iterable of object
            An iterable.
        chunksize : int
            Split `corpus` into chunks of this size.
        maxsize : int, optional
            If > 0, prepare chunks in a background process, filling a chunk queue of size at most `maxsize`.
        as_numpy : bool, optional
            Yield chunks as `np.ndarray` instead of lists?

        Notes
        -----
        Each chunk is of length `chunksize`, except the last one which may be smaller.
        A once-only input stream (`corpus` from a generator) is ok, chunking is done efficiently via itertools.

        If `maxsize > 0`, don't wait idly in between successive chunk `yields`, but rather keep filling a short
        queue (of size at most `maxsize`) with forthcoming chunks in advance. This is realized by starting a
        separate process, and is meant to reduce I/O delays, which can be significant when `corpus` comes from a
        slow medium like HDD, database or network.

        If `maxsize == 0`, don't fool around with parallelism and simply yield chunks serially
        via :func:`~gensim.utils.chunkize_serial` (no I/O optimizations).

        Yields
        ------
        list of object OR np.ndarray
            "chunksize"-ed chunks of elements from `corpus`.
        """
        assert chunksize > 0

        if maxsize > 0:
            q = multiprocessing.Queue(maxsize=maxsize)
            worker = InputQueue(q, corpus, chunksize, maxsize=maxsize, as_numpy=as_numpy)
            worker.daemon = True
            worker.start()
            while True:
                chunk = [q.get(block=True)]
                if chunk[0] is None:
                    break
                yield chunk.pop()
        else:
            for chunk in chunkize_serial(corpus, chunksize, as_numpy=as_numpy):
                yield chunk
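
# Hedged sketch of the chunking helpers (illustrative; mirrors the `grouper` example above):
#
#   >>> list(chunkize(range(7), 3, maxsize=0))   # maxsize=0: plain serial chunking
#   [[0, 1, 2], [3, 4, 5], [6]]
#
# With maxsize > 0 (on non-Windows systems), the same chunks arrive via a background
# InputQueue process instead, which helps when the corpus comes from a slow medium.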


def smart_extension(fname, ext):
    """Append a file extension `ext` to `fname`, while keeping compressed extensions like `.bz2` or
    `.gz` (if any) at the end.

    Parameters
    ----------
    fname : str
        Filename or full path.
    ext : str
        Extension to append before any compression extensions.

    Returns
    -------
    str
        New path to file with `ext` appended.

    Examples
    --------
    >>> from gensim.utils import smart_extension
    >>> smart_extension("my_file.pkl.gz", ".vectors")
    'my_file.pkl.vectors.gz'
    """
    fname, oext = os.path.splitext(fname)
    if oext.endswith('.bz2'):
        fname = fname + oext[:-4] + ext + '.bz2'
    elif oext.endswith('.gz'):
        fname = fname + oext[:-3] + ext + '.gz'
    else:
        fname = fname + oext + ext

    return fname


def pickle(obj, fname, protocol=2):
    """Pickle object `obj` to file `fname`, using smart_open so that `fname` can be on S3, HDFS, compressed etc.

    Parameters
    ----------
    obj : object
        Any python object.
    fname : str
        Path to pickle file.
    protocol : int, optional
        Pickle protocol number. Default is 2 in order to support compatibility across python 2.x and 3.x.
    """
    with smart_open(fname, 'wb') as fout:  # 'b' for binary, needed on Windows
        _pickle.dump(obj, fout, protocol=protocol)


def unpickle(fname):
    """Load object from `fname`, using smart_open so that `fname` can be on S3, HDFS, compressed etc.

    Parameters
    ----------
    fname : str
        Path to pickle file.

    Returns
    -------
    object
        Python object loaded from `fname`.
    """
    with smart_open(fname, 'rb') as f:
        # Because of loading from S3 load can't be used (missing readline in smart_open)
        if sys.version_info > (3, 0):
            return _pickle.load(f, encoding='latin1')
        else:
            return _pickle.loads(f.read())


def revdict(d):
    """Reverse a dictionary mapping, i.e. `{1: 2, 3: 4}` -> `{2: 1, 4: 3}`.

    Parameters
    ----------
    d : dict
        Input dictionary.

    Returns
    -------
    dict
        Reversed dictionary mapping.

    Notes
    -----
    When two keys map to the same value, only one of them will be kept in the result (which one is kept is arbitrary).

    Examples
    --------
    >>> from gensim.utils import revdict
    >>> d = {1: 2, 3: 4}
    >>> revdict(d)
    {2: 1, 4: 3}
    """
    return {v: k for (k, v) in iteritems(dict(d))}


def deprecated(reason):
    """Decorator to mark functions as deprecated.

    Calling a decorated function will result in a warning being emitted, using warnings.warn.
    Adapted from https://stackoverflow.com/a/40301488/8001386.

    Parameters
    ----------
    reason : str
        Reason of deprecation.

    Returns
    -------
    function
        Decorated function.
    """
    if isinstance(reason, string_types):
        def decorator(func):
            fmt = "Call to deprecated `{name}` ({reason})."

            @wraps(func)
            def new_func1(*args, **kwargs):
                warnings.warn(
                    fmt.format(name=func.__name__, reason=reason),
                    category=DeprecationWarning,
                    stacklevel=2
                )
                return func(*args, **kwargs)

            return new_func1
        return decorator

    elif inspect.isclass(reason) or inspect.isfunction(reason):
        func = reason
        fmt = "Call to deprecated `{name}`."

        @wraps(func)
        def new_func2(*args, **kwargs):
            warnings.warn(
                fmt.format(name=func.__name__),
                category=DeprecationWarning,
                stacklevel=2
            )
            return func(*args, **kwargs)

        return new_func2

    else:
        raise TypeError(repr(type(reason)))


@deprecated("Function will be removed in 4.0.0")
def toptexts(query, texts, index, n=10):
    """Debugging function to help inspect the top `n` most similar documents (according to a similarity
    index `index`), to see if they are actually related to the query.

    Parameters
    ----------
    query : {list of (int, number), numpy.ndarray}
        Vector OR BoW (list of tuples).
    texts : str
        Object that can return something insightful for each document via `texts[docid]`,
        such as its fulltext or snippet.
    index : any
        An instance from :mod:`gensim.similarities.docsim`.

    Returns
    -------
    list
        A list of 3-tuples (docid, doc's similarity to the query, texts[docid]).
    """
    sims = index[query]  # perform a similarity query against the corpus
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    return [(topid, topcosine, texts[topid]) for topid, topcosine in sims[:n]]  # only consider top-n most similar docs


def randfname(prefix='gensim'):
    """Generate a random filename in temp.

    Parameters
    ----------
    prefix : str
        Prefix of filename.

    Returns
    -------
    str
        Full path in the system's temporary folder, ending in a random filename.
    """
    randpart = hex(random.randint(0, 0xffffff))[2:]
    return os.path.join(tempfile.gettempdir(), prefix + randpart)


@deprecated("Function will be removed in 4.0.0")
def upload_chunked(server, docs, chunksize=1000, preprocess=None):
    """Memory-friendly upload of documents to a SimServer (or Pyro SimServer proxy).

    Notes
    -----
    Use this function to train or index large collections -- avoid sending the
    entire corpus over the wire as a single Pyro in-memory object. The documents
    will be sent in smaller chunks, of `chunksize` documents each.
    """
    start = 0
    for chunk in grouper(docs, chunksize):
        end = start + len(chunk)
        logger.info("uploading documents %i-%i", start, end - 1)

        if preprocess is not None:
            pchunk = []
            for doc in chunk:
                doc['tokens'] = preprocess(doc['text'])
                del doc['text']
                pchunk.append(doc)
            chunk = pchunk

        server.buffer(chunk)
        start = end


def getNS(host=None, port=None, broadcast=True, hmac_key=None):
    """Get a Pyro4 name server proxy.

    Parameters
    ----------
    host : str, optional
        Name server hostname.
    port : int, optional
        Name server port.
    broadcast : bool, optional
        Use broadcast mechanism? (i.e. reach out to all Pyro nodes in the network)
    hmac_key : str, optional
        Private key.

    Raises
    ------
    RuntimeError
        When Pyro name server is not found.

    Returns
    -------
    :class:`Pyro4.core.Proxy`
        Proxy from Pyro4.
    """
    import Pyro4
    try:
        return Pyro4.locateNS(host, port, broadcast, hmac_key)
    except Pyro4.errors.NamingError:
        raise RuntimeError("Pyro name server not found")


def pyro_daemon(name, obj, random_suffix=False, ip=None, port=None, ns_conf=None):
    """Register an object with the Pyro name server.

    Start the name server if not running yet and block until the daemon is terminated.
    The object is registered under `name`, or `name` + some random suffix if `random_suffix` is set.
    """
    if ns_conf is None:
        ns_conf = {}
    if random_suffix:
        name += '.' + hex(random.randint(0, 0xffffff))[2:]

    import Pyro4
    with getNS(**ns_conf) as ns:
        with Pyro4.Daemon(ip or get_my_ip(), port or 0) as daemon:
            # register server for remote access
            uri = daemon.register(obj, name)
            ns.remove(name)
            ns.register(name, uri)
            logger.info("%s registered with nameserver (URI '%s')", name, uri)
            daemon.requestLoop()


def has_pattern():
    """Check whether the `pattern <https://github.com/clips/pattern>`_ package is installed.

    Returns
    -------
    bool
        Is `pattern` installed?
    """
    try:
        from pattern.en import parse  # noqa:F401
        return True
    except ImportError:
        return False


def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
              stopwords=frozenset(), min_length=2, max_length=15):
    """Use the English lemmatizer from `pattern <https://github.com/clips/pattern>`_ to extract UTF8-encoded tokens
    in their base form aka lemma, e.g. "are, is, being" becomes "be" etc.

    This is a smarter version of stemming, taking word context into account.

    Parameters
    ----------
    content : str
        Input string.
    allowed_tags : :class:`_sre.SRE_Pattern`, optional
        Compiled regexp to select POS that will be used.
        Only considers nouns, verbs, adjectives and adverbs by default (=all other lemmas are discarded).
    light : bool, optional
        DEPRECATED FLAG, no longer supported by `pattern`.
    stopwords : frozenset, optional
        Set of words that will be removed from output.
    min_length : int, optional
        Minimal token length in output (inclusive).
    max_length : int, optional
        Maximal token length in output (inclusive).

    Returns
    -------
    list of str
        List of tokens with POS tags.

    Warnings
    --------
    This function is only available when the optional `pattern <https://github.com/clips/pattern>`_ is installed.

    Raises
    ------
    ImportError
        If `pattern <https://github.com/clips/pattern>`_ is not installed.

    Examples
    --------
    >>> from gensim.utils import lemmatize
    >>> lemmatize('Hello World! How is it going?! Nonexistentword, 21')
    ['world/NN', 'be/VB', 'go/VB', 'nonexistentword/NN']

    Note the context-dependent part-of-speech tags between these two examples:

    >>> lemmatize('The study ranks high.')
    ['study/NN', 'rank/VB', 'high/JJ']
    >>> lemmatize('The ranks study hard.')
    ['rank/NN', 'study/VB', 'hard/RB']
    """
    if not has_pattern():
        raise ImportError(
            "Pattern library is not installed. Pattern library is needed in order to use lemmatize function"
        )
    from pattern.en import parse

    if light:
        import warnings
        warnings.warn("The light flag is no longer supported by pattern.")

    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
    # FIXME this throws away all fancy parsing cues, including sentence structure,
    # abbreviations etc.
    content = u(' ').join(tokenize(content, lower=True, errors='ignore'))

    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
                if allowed_tags.match(tag):
                    lemma += "/" + tag[:2]
                    result.append(lemma.encode('utf8'))
    return result


def mock_data_row(dim=1000, prob_nnz=0.5, lam=1.0):
    """Create a random gensim BoW vector, with the feature counts following the Poisson distribution.

    Parameters
    ----------
    dim : int, optional
        Dimension of vector.
    prob_nnz : float, optional
        Probability that each coordinate will be nonzero; nonzero values are drawn from the Poisson distribution.
    lam : float, optional
        Lambda parameter for the Poisson distribution.

    Returns
    -------
    list of (int, float)
        Vector in BoW format.
    """
    nnz = np.random.uniform(size=(dim,))
    return [(i, float(np.random.poisson(lam=lam) + 1.0)) for i in xrange(dim) if nnz[i] < prob_nnz]


def mock_data(n_items=1000, dim=1000, prob_nnz=0.5, lam=1.0):
    """Create a random Gensim-style corpus (BoW), using :func:`~gensim.utils.mock_data_row`.

    Parameters
    ----------
    n_items : int
        Size of corpus.
    dim : int
        Dimension of vector, used for :func:`~gensim.utils.mock_data_row`.
    prob_nnz : float, optional
        Probability that each coordinate will be nonzero; nonzero values are drawn from the Poisson distribution,
        used for :func:`~gensim.utils.mock_data_row`.
    lam : float, optional
        Parameter for the Poisson distribution, used for :func:`~gensim.utils.mock_data_row`.

    Returns
    -------
    list of list of (int, float)
        Gensim-style corpus.
    """
    return [mock_data_row(dim=dim, prob_nnz=prob_nnz, lam=lam) for _ in xrange(n_items)]


def prune_vocab(vocab, min_reduce, trim_rule=None):
    """Remove all entries from the `vocab` dictionary with count smaller than `min_reduce`.

    Modifies `vocab` in place, returns the sum of all counts that were pruned.

    Parameters
    ----------
    vocab : dict
        Input dictionary.
    min_reduce : int
        Frequency threshold for tokens in `vocab`.
    trim_rule : function, optional
        Function for trimming entities from vocab, default behaviour is `vocab[w] <= min_reduce`.

    Returns
    -------
    result : int
        Sum of all counts that were pruned.
    """
    result = 0
    old_len = len(vocab)
    for w in list(vocab):  # make a copy of dict's keys
        if not keep_vocab_item(w, vocab[w], min_reduce, trim_rule):  # vocab[w] <= min_reduce:
            result += vocab[w]
            del vocab[w]
    logger.info(
        "pruned out %i tokens with count <=%i (before %i, after %i)",
        old_len - len(vocab), min_reduce, old_len, len(vocab)
    )
    return result
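
# Minimal prune_vocab sketch (added for illustration, not part of the original module):
#
#   >>> vocab = {'a': 10, 'b': 2, 'c': 1}
#   >>> prune_vocab(vocab, min_reduce=3)   # drops 'b' and 'c', returns the pruned count 2 + 1
#   3
#   >>> vocab
#   {'a': 10}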


def trim_vocab_by_freq(vocab, topk, trim_rule=None):
    """Retain the `topk` most frequent words in `vocab`.
    If there are more words with the same frequency as the `topk`-th one, they will be kept.
    Modifies `vocab` in place, returns nothing.

    Parameters
    ----------
    vocab : dict
        Input dictionary.
    topk : int
        Number of words with the highest frequencies to keep.
    trim_rule : function, optional
        Function for trimming entities from vocab, default behaviour is `vocab[w] <= min_count`.
    """
    if topk >= len(vocab):
        return

    min_count = heapq.nlargest(topk, itervalues(vocab))[-1]
    prune_vocab(vocab, min_count, trim_rule=trim_rule)


def merge_counts(dict1, dict2):
    """Merge `dict1` of (word, freq1) and `dict2` of (word, freq2) into `dict1` of (word, freq1 + freq2).

    Parameters
    ----------
    dict1 : dict of (str, int)
        First dictionary.
    dict2 : dict of (str, int)
        Second dictionary.

    Returns
    -------
    result : dict
        Merged dictionary with sum of frequencies as values.
    """
    for word, freq in iteritems(dict2):
        if word in dict1:
            dict1[word] += freq
        else:
            dict1[word] = freq

    return dict1
  1429. def qsize(queue):
  1430. """Get the (approximate) queue size where available.
  1431. Parameters
  1432. ----------
  1433. queue : :class:`queue.Queue`
  1434. Input queue.
  1435. Returns
  1436. -------
  1437. int
  1438. Queue size, -1 if `qsize` method isn't implemented (OS X).
  1439. """
  1440. try:
  1441. return queue.qsize()
  1442. except NotImplementedError:
  1443. # OS X doesn't support qsize
  1444. return -1


RULE_DEFAULT = 0
RULE_DISCARD = 1
RULE_KEEP = 2


def keep_vocab_item(word, count, min_count, trim_rule=None):
    """Should we keep `word` in the vocab or remove it?

    Parameters
    ----------
    word : str
        Input word.
    count : int
        Number of times that word appeared in a corpus.
    min_count : int
        Discard words with frequency smaller than this.
    trim_rule : function, optional
        Custom function to decide whether to keep or discard this word.
        If a custom `trim_rule` is not specified, the default behaviour is simply `count >= min_count`.

    Returns
    -------
    bool
        True if `word` should stay, False otherwise.
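
    Examples
    --------
    A small illustrative sketch; ``keep_short`` is a hypothetical trim rule that always keeps words
    of at most 4 characters and defers to the default behaviour otherwise:

    >>> from gensim.utils import keep_vocab_item, RULE_KEEP, RULE_DEFAULT
    >>> keep_vocab_item('word', count=3, min_count=5)
    False
    >>> def keep_short(word, count, min_count):
    ...     return RULE_KEEP if len(word) <= 4 else RULE_DEFAULT
    >>> keep_vocab_item('word', count=3, min_count=5, trim_rule=keep_short)
    True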
  1465. """
  1466. default_res = count >= min_count
  1467. if trim_rule is None:
  1468. return default_res
  1469. else:
  1470. rule_res = trim_rule(word, count, min_count)
  1471. if rule_res == RULE_KEEP:
  1472. return True
  1473. elif rule_res == RULE_DISCARD:
  1474. return False
  1475. else:
  1476. return default_res


def check_output(stdout=subprocess.PIPE, *popenargs, **kwargs):
    r"""Run OS command with the given arguments and return its output as a byte string.

    Backported from Python 2.7 with a few minor modifications. Widely used for :mod:`gensim.models.wrappers`.
    Behaves very similarly to https://docs.python.org/2/library/subprocess.html#subprocess.check_output.

    Examples
    --------
    >>> from gensim.utils import check_output
    >>> check_output(args=['echo', '1'])
    '1\n'

    Raises
    ------
    KeyboardInterrupt
        If Ctrl+C is pressed.

    """
    try:
        logger.debug("COMMAND: %s %s", popenargs, kwargs)
        process = subprocess.Popen(stdout=stdout, *popenargs, **kwargs)
        output, unused_err = process.communicate()
        retcode = process.poll()
        if retcode:
            cmd = kwargs.get("args")
            if cmd is None:
                cmd = popenargs[0]
            error = subprocess.CalledProcessError(retcode, cmd)
            error.output = output
            raise error
        return output
    except KeyboardInterrupt:
        process.terminate()
        raise


def sample_dict(d, n=10, use_random=True):
    """Select `n` (possibly random) items from the dictionary `d`.

    Parameters
    ----------
    d : dict
        Input dictionary.
    n : int, optional
        Number of items to select.
    use_random : bool, optional
        Select items randomly (without replacement), instead of by the natural dict iteration order?

    Returns
    -------
    list of (object, object)
        Selected items from the dictionary, as a list.
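
    Examples
    --------
    A minimal sketch; ``use_random=False`` makes the output deterministic, since it follows the
    dict's iteration order (insertion order on modern CPython):

    >>> from gensim.utils import sample_dict
    >>> sample_dict({'a': 1, 'b': 2, 'c': 3}, n=2, use_random=False)
    [('a', 1), ('b', 2)]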
  1521. """
  1522. selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n)
  1523. return [(key, d[key]) for key in selected_keys]


def strided_windows(ndarray, window_size):
    """Produce a numpy.ndarray of windows, as from a sliding window.

    Parameters
    ----------
    ndarray : numpy.ndarray
        Input array.
    window_size : int
        Sliding window size.

    Returns
    -------
    numpy.ndarray
        Subsequences produced by sliding a window of the given size over the `ndarray`.
        Since this uses striding, the individual arrays are views rather than copies of `ndarray`.
        Changes to one view modify the others and the original.

    Examples
    --------
    >>> import numpy as np
    >>> from gensim.utils import strided_windows
    >>> strided_windows(np.arange(5), 2)
    array([[0, 1],
           [1, 2],
           [2, 3],
           [3, 4]])
    >>> strided_windows(np.arange(10), 5)
    array([[0, 1, 2, 3, 4],
           [1, 2, 3, 4, 5],
           [2, 3, 4, 5, 6],
           [3, 4, 5, 6, 7],
           [4, 5, 6, 7, 8],
           [5, 6, 7, 8, 9]])

    """
    ndarray = np.asarray(ndarray)
    if window_size == ndarray.shape[0]:
        return np.array([ndarray])
    elif window_size > ndarray.shape[0]:
        return np.ndarray((0, 0))

    stride = ndarray.strides[0]
    return np.lib.stride_tricks.as_strided(
        ndarray, shape=(ndarray.shape[0] - window_size + 1, window_size),
        strides=(stride, stride))


def iter_windows(texts, window_size, copy=False, ignore_below_size=True, include_doc_num=False):
    """Produce a generator over the given texts using a sliding window of `window_size`.

    The windows produced are views of some subsequence of a text.
    To use deep copies instead, pass `copy=True`.

    Parameters
    ----------
    texts : list of list of str
        List of tokenized texts.
    window_size : int
        Size of the sliding window.
    copy : bool, optional
        Produce deep copies instead of views?
    ignore_below_size : bool, optional
        Ignore documents that are not at least `window_size` in length?
    include_doc_num : bool, optional
        Yield the text position within `texts` along with each window?
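
    Examples
    --------
    A minimal sketch; ``tolist()`` is used only to turn the numpy window views into plain lists.
    The second text is skipped because it is shorter than the window:

    >>> from gensim.utils import iter_windows
    >>> texts = [['the', 'quick', 'brown', 'fox'], ['ok']]
    >>> [w.tolist() for w in iter_windows(texts, 2)]
    [['the', 'quick'], ['quick', 'brown'], ['brown', 'fox']]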
  1579. """
  1580. for doc_num, document in enumerate(texts):
  1581. for window in _iter_windows(document, window_size, copy, ignore_below_size):
  1582. if include_doc_num:
  1583. yield (doc_num, window)
  1584. else:
  1585. yield window


def _iter_windows(document, window_size, copy=False, ignore_below_size=True):
    # Helper for iter_windows: yield sliding windows over a single document,
    # optionally yielding the whole document when it is shorter than `window_size`.
    doc_windows = strided_windows(document, window_size)
    if doc_windows.shape[0] == 0:
        if not ignore_below_size:
            yield document.copy() if copy else document
    else:
        for doc_window in doc_windows:
            yield doc_window.copy() if copy else doc_window


def flatten(nested_list):
    """Recursively flatten a nested sequence of elements.

    Parameters
    ----------
    nested_list : iterable
        Possibly nested sequence of elements to flatten.

    Returns
    -------
    list
        Flattened version of `nested_list` where any elements that are an iterable (`collections.Iterable`)
        have been unpacked into the top-level list, in a recursive fashion.
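
    Examples
    --------
    A minimal sketch; note that strings are kept whole rather than being unpacked into characters:

    >>> from gensim.utils import flatten
    >>> flatten([[1, 2, [3]], 4, 'text'])
    [1, 2, 3, 4, 'text']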
  1605. """
  1606. return list(lazy_flatten(nested_list))


def lazy_flatten(nested_list):
    """Lazy version of :func:`~gensim.utils.flatten`.

    Parameters
    ----------
    nested_list : list
        Possibly nested list.

    Yields
    ------
    object
        Element of the flattened list.

    """
    for el in nested_list:
        if isinstance(el, collections.Iterable) and not isinstance(el, string_types):
            for sub in flatten(el):
                yield sub
        else:
            yield el


def save_as_line_sentence(corpus, filename):
    """Save the corpus in LineSentence format, i.e. each sentence on a separate line,
    with tokens separated by spaces.

    Parameters
    ----------
    corpus : iterable of iterables of strings
        Tokenized sentences to save.
    filename : str
        Path to the output file.
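
    Examples
    --------
    A minimal sketch; the output path is built from a temporary directory purely for illustration:

    >>> import os, tempfile
    >>> from gensim.utils import save_as_line_sentence
    >>> path = os.path.join(tempfile.mkdtemp(), 'sentences.txt')
    >>> save_as_line_sentence([['hello', 'world'], ['gensim', 'rocks']], path)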
  1630. """
  1631. with smart_open(filename, mode='wb', encoding='utf8') as fout:
  1632. for sentence in corpus:
  1633. line = any2unicode(' '.join(sentence) + '\n')
  1634. fout.write(line)


def effective_n_jobs(n_jobs):
    """Determine the number of jobs that can run in parallel.

    Just like in sklearn, passing `n_jobs=-1` means using all available CPU cores.

    Parameters
    ----------
    n_jobs : int
        Number of workers requested by the caller.

    Returns
    -------
    int
        Number of effective jobs.
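
    Examples
    --------
    A minimal sketch; negative values count back from the number of CPU cores, so the result of
    ``effective_n_jobs(-1)`` depends on the machine and is not shown here:

    >>> from gensim.utils import effective_n_jobs
    >>> effective_n_jobs(None)
    1
    >>> effective_n_jobs(2)
    2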
  1647. """
  1648. if n_jobs == 0:
  1649. raise ValueError('n_jobs == 0 in Parallel has no meaning')
  1650. elif n_jobs is None:
  1651. return 1
  1652. elif n_jobs < 0:
  1653. n_jobs = max(cpu_count() + 1 + n_jobs, 1)
  1654. return n_jobs