You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1578 lines
52 KiB

4 years ago
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
  5. # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
  6. """Math helper functions."""
  7. from __future__ import with_statement
  8. from itertools import chain
  9. import logging
  10. import math
  11. from gensim import utils
  12. import numpy as np
  13. import scipy.sparse
  14. from scipy.stats import entropy
  15. import scipy.linalg
  16. from scipy.linalg.lapack import get_lapack_funcs
  17. from scipy.linalg.special_matrices import triu
  18. from scipy.special import psi # gamma function utils
  19. from six import iteritems, itervalues, string_types
  20. from six.moves import xrange, zip as izip
  21. logger = logging.getLogger(__name__)
  22. def blas(name, ndarray):
  23. """Helper for getting the appropriate BLAS function, using :func:`scipy.linalg.get_blas_funcs`.
  24. Parameters
  25. ----------
  26. name : str
  27. Name(s) of BLAS functions, without the type prefix.
  28. ndarray : numpy.ndarray
  29. Arrays can be given to determine optimal prefix of BLAS routines.
  30. Returns
  31. -------
  32. object
  33. BLAS function for the needed operation on the given data type.
  34. """
  35. return scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
  36. def argsort(x, topn=None, reverse=False):
  37. """Efficiently calculate indices of the `topn` smallest elements in array `x`.
  38. Parameters
  39. ----------
  40. x : array_like
  41. Array to get the smallest element indices from.
  42. topn : int, optional
  43. Number of indices of the smallest (greatest) elements to be returned.
  44. If not given, indices of all elements will be returned in ascending (descending) order.
  45. reverse : bool, optional
  46. Return the `topn` greatest elements in descending order,
  47. instead of smallest elements in ascending order?
  48. Returns
  49. -------
  50. numpy.ndarray
  51. Array of `topn` indices that sort the array in the requested order.
  52. """
  53. x = np.asarray(x) # unify code path for when `x` is not a np array (list, tuple...)
  54. if topn is None:
  55. topn = x.size
  56. if topn <= 0:
  57. return []
  58. if reverse:
  59. x = -x
  60. if topn >= x.size or not hasattr(np, 'argpartition'):
  61. return np.argsort(x)[:topn]
  62. # np >= 1.8 has a fast partial argsort, use that!
  63. most_extreme = np.argpartition(x, topn)[:topn]
  64. return most_extreme.take(np.argsort(x.take(most_extreme))) # resort topn into order
  65. def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0):
  66. """Convert a streamed corpus in bag-of-words format into a sparse matrix `scipy.sparse.csc_matrix`,
  67. with documents as columns.
  68. Notes
  69. -----
  70. If the number of terms, documents and non-zero elements is known, you can pass
  71. them here as parameters and a (much) more memory efficient code path will be taken.
  72. Parameters
  73. ----------
  74. corpus : iterable of iterable of (int, number)
  75. Input corpus in BoW format
  76. num_terms : int, optional
  77. Number of terms in `corpus`. If provided, the `corpus.num_terms` attribute (if any) will be ignored.
  78. dtype : data-type, optional
  79. Data type of output CSC matrix.
  80. num_docs : int, optional
  81. Number of documents in `corpus`. If provided, the `corpus.num_docs` attribute (in any) will be ignored.
  82. num_nnz : int, optional
  83. Number of non-zero elements in `corpus`. If provided, the `corpus.num_nnz` attribute (if any) will be ignored.
  84. printprogress : int, optional
  85. Log a progress message at INFO level once every `printprogress` documents. 0 to turn off progress logging.
  86. Returns
  87. -------
  88. scipy.sparse.csc_matrix
  89. `corpus` converted into a sparse CSC matrix.
  90. See Also
  91. --------
  92. :class:`~gensim.matutils.Sparse2Corpus`
  93. Convert sparse format to Gensim corpus format.
  94. """
  95. try:
  96. # if the input corpus has the `num_nnz`, `num_docs` and `num_terms` attributes
  97. # (as is the case with MmCorpus for example), we can use a more efficient code path
  98. if num_terms is None:
  99. num_terms = corpus.num_terms
  100. if num_docs is None:
  101. num_docs = corpus.num_docs
  102. if num_nnz is None:
  103. num_nnz = corpus.num_nnz
  104. except AttributeError:
  105. pass # not a MmCorpus...
  106. if printprogress:
  107. logger.info("creating sparse matrix from corpus")
  108. if num_terms is not None and num_docs is not None and num_nnz is not None:
  109. # faster and much more memory-friendly version of creating the sparse csc
  110. posnow, indptr = 0, [0]
  111. indices = np.empty((num_nnz,), dtype=np.int32) # HACK assume feature ids fit in 32bit integer
  112. data = np.empty((num_nnz,), dtype=dtype)
  113. for docno, doc in enumerate(corpus):
  114. if printprogress and docno % printprogress == 0:
  115. logger.info("PROGRESS: at document #%i/%i", docno, num_docs)
  116. posnext = posnow + len(doc)
  117. indices[posnow: posnext] = [feature_id for feature_id, _ in doc]
  118. data[posnow: posnext] = [feature_weight for _, feature_weight in doc]
  119. indptr.append(posnext)
  120. posnow = posnext
  121. assert posnow == num_nnz, "mismatch between supplied and computed number of non-zeros"
  122. result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)
  123. else:
  124. # slower version; determine the sparse matrix parameters during iteration
  125. num_nnz, data, indices, indptr = 0, [], [], [0]
  126. for docno, doc in enumerate(corpus):
  127. if printprogress and docno % printprogress == 0:
  128. logger.info("PROGRESS: at document #%i", docno)
  129. indices.extend([feature_id for feature_id, _ in doc])
  130. data.extend([feature_weight for _, feature_weight in doc])
  131. num_nnz += len(doc)
  132. indptr.append(num_nnz)
  133. if num_terms is None:
  134. num_terms = max(indices) + 1 if indices else 0
  135. num_docs = len(indptr) - 1
  136. # now num_docs, num_terms and num_nnz contain the correct values
  137. data = np.asarray(data, dtype=dtype)
  138. indices = np.asarray(indices)
  139. result = scipy.sparse.csc_matrix((data, indices, indptr), shape=(num_terms, num_docs), dtype=dtype)
  140. return result
  141. def pad(mat, padrow, padcol):
  142. """Add additional rows/columns to `mat`. The new rows/columns will be initialized with zeros.
  143. Parameters
  144. ----------
  145. mat : numpy.ndarray
  146. Input 2D matrix
  147. padrow : int
  148. Number of additional rows
  149. padcol : int
  150. Number of additional columns
  151. Returns
  152. -------
  153. numpy.matrixlib.defmatrix.matrix
  154. Matrix with needed padding.
  155. """
  156. if padrow < 0:
  157. padrow = 0
  158. if padcol < 0:
  159. padcol = 0
  160. rows, cols = mat.shape
  161. return np.bmat([
  162. [mat, np.matrix(np.zeros((rows, padcol)))],
  163. [np.matrix(np.zeros((padrow, cols + padcol)))],
  164. ])
  165. def zeros_aligned(shape, dtype, order='C', align=128):
  166. """Get array aligned at `align` byte boundary in memory.
  167. Parameters
  168. ----------
  169. shape : int or (int, int)
  170. Shape of array.
  171. dtype : data-type
  172. Data type of array.
  173. order : {'C', 'F'}, optional
  174. Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory.
  175. align : int, optional
  176. Boundary for alignment in bytes.
  177. Returns
  178. -------
  179. numpy.ndarray
  180. Aligned array.
  181. """
  182. nbytes = np.prod(shape, dtype=np.int64) * np.dtype(dtype).itemsize
  183. buffer = np.zeros(nbytes + align, dtype=np.uint8) # problematic on win64 ("maximum allowed dimension exceeded")
  184. start_index = -buffer.ctypes.data % align
  185. return buffer[start_index: start_index + nbytes].view(dtype).reshape(shape, order=order)
  186. def ismatrix(m):
  187. """Check whether `m` is a 2D `numpy.ndarray` or `scipy.sparse` matrix.
  188. Parameters
  189. ----------
  190. m : object
  191. Object to check.
  192. Returns
  193. -------
  194. bool
  195. Is `m` a 2D `numpy.ndarray` or `scipy.sparse` matrix.
  196. """
  197. return isinstance(m, np.ndarray) and m.ndim == 2 or scipy.sparse.issparse(m)
  198. def any2sparse(vec, eps=1e-9):
  199. """Convert a numpy.ndarray or `scipy.sparse` vector into the Gensim bag-of-words format.
  200. Parameters
  201. ----------
  202. vec : {`numpy.ndarray`, `scipy.sparse`}
  203. Input vector
  204. eps : float, optional
  205. Value used for threshold, all coordinates less than `eps` will not be presented in result.
  206. Returns
  207. -------
  208. list of (int, float)
  209. Vector in BoW format.
  210. """
  211. if isinstance(vec, np.ndarray):
  212. return dense2vec(vec, eps)
  213. if scipy.sparse.issparse(vec):
  214. return scipy2sparse(vec, eps)
  215. return [(int(fid), float(fw)) for fid, fw in vec if np.abs(fw) > eps]
  216. def scipy2scipy_clipped(matrix, topn, eps=1e-9):
  217. """Get the 'topn' elements of the greatest magnitude (absolute value) from a `scipy.sparse` vector or matrix.
  218. Parameters
  219. ----------
  220. matrix : `scipy.sparse`
  221. Input vector or matrix (1D or 2D sparse array).
  222. topn : int
  223. Number of greatest elements, in absolute value, to return.
  224. eps : float
  225. Ignored.
  226. Returns
  227. -------
  228. `scipy.sparse.csr.csr_matrix`
  229. Clipped matrix.
  230. """
  231. if not scipy.sparse.issparse(matrix):
  232. raise ValueError("'%s' is not a scipy sparse vector." % matrix)
  233. if topn <= 0:
  234. return scipy.sparse.csr_matrix([])
  235. # Return clipped sparse vector if input is a sparse vector.
  236. if matrix.shape[0] == 1:
  237. # use np.argpartition/argsort and only form tuples that are actually returned.
  238. biggest = argsort(abs(matrix.data), topn, reverse=True)
  239. indices, data = matrix.indices.take(biggest), matrix.data.take(biggest)
  240. return scipy.sparse.csr_matrix((data, indices, [0, len(indices)]))
  241. # Return clipped sparse matrix if input is a matrix, processing row by row.
  242. else:
  243. matrix_indices = []
  244. matrix_data = []
  245. matrix_indptr = [0]
  246. # calling abs() on entire matrix once is faster than calling abs() iteratively for each row
  247. matrix_abs = abs(matrix)
  248. for i in range(matrix.shape[0]):
  249. v = matrix.getrow(i)
  250. v_abs = matrix_abs.getrow(i)
  251. # Sort and clip each row vector first.
  252. biggest = argsort(v_abs.data, topn, reverse=True)
  253. indices, data = v.indices.take(biggest), v.data.take(biggest)
  254. # Store the topn indices and values of each row vector.
  255. matrix_data.append(data)
  256. matrix_indices.append(indices)
  257. matrix_indptr.append(matrix_indptr[-1] + min(len(indices), topn))
  258. matrix_indices = np.concatenate(matrix_indices).ravel()
  259. matrix_data = np.concatenate(matrix_data).ravel()
  260. # Instantiate and return a sparse csr_matrix which preserves the order of indices/data.
  261. return scipy.sparse.csr.csr_matrix(
  262. (matrix_data, matrix_indices, matrix_indptr),
  263. shape=(matrix.shape[0], np.max(matrix_indices) + 1)
  264. )
  265. def scipy2sparse(vec, eps=1e-9):
  266. """Convert a scipy.sparse vector into the Gensim bag-of-words format.
  267. Parameters
  268. ----------
  269. vec : `scipy.sparse`
  270. Sparse vector.
  271. eps : float, optional
  272. Value used for threshold, all coordinates less than `eps` will not be presented in result.
  273. Returns
  274. -------
  275. list of (int, float)
  276. Vector in Gensim bag-of-words format.
  277. """
  278. vec = vec.tocsr()
  279. assert vec.shape[0] == 1
  280. return [(int(pos), float(val)) for pos, val in zip(vec.indices, vec.data) if np.abs(val) > eps]
  281. class Scipy2Corpus(object):
  282. """Convert a sequence of dense/sparse vectors into a streamed Gensim corpus object.
  283. See Also
  284. --------
  285. :func:`~gensim.matutils.corpus2csc`
  286. Convert corpus in Gensim format to `scipy.sparse.csc` matrix.
  287. """
  288. def __init__(self, vecs):
  289. """
  290. Parameters
  291. ----------
  292. vecs : iterable of {`numpy.ndarray`, `scipy.sparse`}
  293. Input vectors.
  294. """
  295. self.vecs = vecs
  296. def __iter__(self):
  297. for vec in self.vecs:
  298. if isinstance(vec, np.ndarray):
  299. yield full2sparse(vec)
  300. else:
  301. yield scipy2sparse(vec)
  302. def __len__(self):
  303. return len(self.vecs)
  304. def sparse2full(doc, length):
  305. """Convert a document in Gensim bag-of-words format into a dense numpy array.
  306. Parameters
  307. ----------
  308. doc : list of (int, number)
  309. Document in BoW format.
  310. length : int
  311. Vector dimensionality. This cannot be inferred from the BoW, and you must supply it explicitly.
  312. This is typically the vocabulary size or number of topics, depending on how you created `doc`.
  313. Returns
  314. -------
  315. numpy.ndarray
  316. Dense numpy vector for `doc`.
  317. See Also
  318. --------
  319. :func:`~gensim.matutils.full2sparse`
  320. Convert dense array to gensim bag-of-words format.
  321. """
  322. result = np.zeros(length, dtype=np.float32) # fill with zeroes (default value)
  323. # convert indices to int as numpy 1.12 no longer indexes by floats
  324. doc = ((int(id_), float(val_)) for (id_, val_) in doc)
  325. doc = dict(doc)
  326. # overwrite some of the zeroes with explicit values
  327. result[list(doc)] = list(itervalues(doc))
  328. return result
  329. def full2sparse(vec, eps=1e-9):
  330. """Convert a dense numpy array into the Gensim bag-of-words format.
  331. Parameters
  332. ----------
  333. vec : numpy.ndarray
  334. Dense input vector.
  335. eps : float
  336. Feature weight threshold value. Features with `abs(weight) < eps` are considered sparse and
  337. won't be included in the BOW result.
  338. Returns
  339. -------
  340. list of (int, float)
  341. BoW format of `vec`, with near-zero values omitted (sparse vector).
  342. See Also
  343. --------
  344. :func:`~gensim.matutils.sparse2full`
  345. Convert a document in Gensim bag-of-words format into a dense numpy array.
  346. """
  347. vec = np.asarray(vec, dtype=float)
  348. nnz = np.nonzero(abs(vec) > eps)[0]
  349. return list(zip(nnz, vec.take(nnz)))
  350. dense2vec = full2sparse
  351. def full2sparse_clipped(vec, topn, eps=1e-9):
  352. """Like :func:`~gensim.matutils.full2sparse`, but only return the `topn` elements of the greatest magnitude (abs).
  353. This is more efficient that sorting a vector and then taking the greatest values, especially
  354. where `len(vec) >> topn`.
  355. Parameters
  356. ----------
  357. vec : numpy.ndarray
  358. Input dense vector
  359. topn : int
  360. Number of greatest (abs) elements that will be presented in result.
  361. eps : float
  362. Threshold value, if coordinate in `vec` < eps, this will not be presented in result.
  363. Returns
  364. -------
  365. list of (int, float)
  366. Clipped vector in BoW format.
  367. See Also
  368. --------
  369. :func:`~gensim.matutils.full2sparse`
  370. Convert dense array to gensim bag-of-words format.
  371. """
  372. # use np.argpartition/argsort and only form tuples that are actually returned.
  373. # this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on.
  374. if topn <= 0:
  375. return []
  376. vec = np.asarray(vec, dtype=float)
  377. nnz = np.nonzero(abs(vec) > eps)[0]
  378. biggest = nnz.take(argsort(abs(vec).take(nnz), topn, reverse=True))
  379. return list(zip(biggest, vec.take(biggest)))
  380. def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32):
  381. """Convert corpus into a dense numpy 2D array, with documents as columns.
  382. Parameters
  383. ----------
  384. corpus : iterable of iterable of (int, number)
  385. Input corpus in the Gensim bag-of-words format.
  386. num_terms : int
  387. Number of terms in the dictionary. X-axis of the resulting matrix.
  388. num_docs : int, optional
  389. Number of documents in the corpus. If provided, a slightly more memory-efficient code path is taken.
  390. Y-axis of the resulting matrix.
  391. dtype : data-type, optional
  392. Data type of the output matrix.
  393. Returns
  394. -------
  395. numpy.ndarray
  396. Dense 2D array that presents `corpus`.
  397. See Also
  398. --------
  399. :class:`~gensim.matutils.Dense2Corpus`
  400. Convert dense matrix to Gensim corpus format.
  401. """
  402. if num_docs is not None:
  403. # we know the number of documents => don't bother column_stacking
  404. docno, result = -1, np.empty((num_terms, num_docs), dtype=dtype)
  405. for docno, doc in enumerate(corpus):
  406. result[:, docno] = sparse2full(doc, num_terms)
  407. assert docno + 1 == num_docs
  408. else:
  409. result = np.column_stack(sparse2full(doc, num_terms) for doc in corpus)
  410. return result.astype(dtype)
  411. class Dense2Corpus(object):
  412. """Treat dense numpy array as a streamed Gensim corpus in the bag-of-words format.
  413. Notes
  414. -----
  415. No data copy is made (changes to the underlying matrix imply changes in the streamed corpus).
  416. See Also
  417. --------
  418. :func:`~gensim.matutils.corpus2dense`
  419. Convert Gensim corpus to dense matrix.
  420. :class:`~gensim.matutils.Sparse2Corpus`
  421. Convert sparse matrix to Gensim corpus format.
  422. """
  423. def __init__(self, dense, documents_columns=True):
  424. """
  425. Parameters
  426. ----------
  427. dense : numpy.ndarray
  428. Corpus in dense format.
  429. documents_columns : bool, optional
  430. Documents in `dense` represented as columns, as opposed to rows?
  431. """
  432. if documents_columns:
  433. self.dense = dense.T
  434. else:
  435. self.dense = dense
  436. def __iter__(self):
  437. """Iterate over the corpus.
  438. Yields
  439. ------
  440. list of (int, float)
  441. Document in BoW format.
  442. """
  443. for doc in self.dense:
  444. yield full2sparse(doc.flat)
  445. def __len__(self):
  446. return len(self.dense)
  447. class Sparse2Corpus(object):
  448. """Convert a matrix in scipy.sparse format into a streaming Gensim corpus.
  449. See Also
  450. --------
  451. :func:`~gensim.matutils.corpus2csc`
  452. Convert gensim corpus format to `scipy.sparse.csc` matrix
  453. :class:`~gensim.matutils.Dense2Corpus`
  454. Convert dense matrix to gensim corpus.
  455. """
  456. def __init__(self, sparse, documents_columns=True):
  457. """
  458. Parameters
  459. ----------
  460. sparse : `scipy.sparse`
  461. Corpus scipy sparse format
  462. documents_columns : bool, optional
  463. Documents will be column?
  464. """
  465. if documents_columns:
  466. self.sparse = sparse.tocsc()
  467. else:
  468. self.sparse = sparse.tocsr().T # make sure shape[1]=number of docs (needed in len())
  469. def __iter__(self):
  470. """
  471. Yields
  472. ------
  473. list of (int, float)
  474. Document in BoW format.
  475. """
  476. for indprev, indnow in izip(self.sparse.indptr, self.sparse.indptr[1:]):
  477. yield list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow]))
  478. def __len__(self):
  479. return self.sparse.shape[1]
  480. def __getitem__(self, document_index):
  481. """Retrieve a document vector from the corpus by its index.
  482. Parameters
  483. ----------
  484. document_index : int
  485. Index of document
  486. Returns
  487. -------
  488. list of (int, number)
  489. Document in BoW format.
  490. """
  491. indprev = self.sparse.indptr[document_index]
  492. indnow = self.sparse.indptr[document_index + 1]
  493. return list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow]))
  494. def veclen(vec):
  495. """Calculate L2 (euclidean) length of a vector.
  496. Parameters
  497. ----------
  498. vec : list of (int, number)
  499. Input vector in sparse bag-of-words format.
  500. Returns
  501. -------
  502. float
  503. Length of `vec`.
  504. """
  505. if len(vec) == 0:
  506. return 0.0
  507. length = 1.0 * math.sqrt(sum(val**2 for _, val in vec))
  508. assert length > 0.0, "sparse documents must not contain any explicit zero entries"
  509. return length
  510. def ret_normalized_vec(vec, length):
  511. """Normalize a vector in L2 (Euclidean unit norm).
  512. Parameters
  513. ----------
  514. vec : list of (int, number)
  515. Input vector in BoW format.
  516. length : float
  517. Length of vector
  518. Returns
  519. -------
  520. list of (int, number)
  521. L2-normalized vector in BoW format.
  522. """
  523. if length != 1.0:
  524. return [(termid, val / length) for termid, val in vec]
  525. else:
  526. return list(vec)
  527. def ret_log_normalize_vec(vec, axis=1):
  528. log_max = 100.0
  529. if len(vec.shape) == 1:
  530. max_val = np.max(vec)
  531. log_shift = log_max - np.log(len(vec) + 1.0) - max_val
  532. tot = np.sum(np.exp(vec + log_shift))
  533. log_norm = np.log(tot) - log_shift
  534. vec -= log_norm
  535. else:
  536. if axis == 1: # independently normalize each sample
  537. max_val = np.max(vec, 1)
  538. log_shift = log_max - np.log(vec.shape[1] + 1.0) - max_val
  539. tot = np.sum(np.exp(vec + log_shift[:, np.newaxis]), 1)
  540. log_norm = np.log(tot) - log_shift
  541. vec = vec - log_norm[:, np.newaxis]
  542. elif axis == 0: # normalize each feature
  543. k = ret_log_normalize_vec(vec.T)
  544. return k[0].T, k[1]
  545. else:
  546. raise ValueError("'%s' is not a supported axis" % axis)
  547. return vec, log_norm
  548. blas_nrm2 = blas('nrm2', np.array([], dtype=float))
  549. blas_scal = blas('scal', np.array([], dtype=float))
  550. def unitvec(vec, norm='l2', return_norm=False):
  551. """Scale a vector to unit length.
  552. Parameters
  553. ----------
  554. vec : {numpy.ndarray, scipy.sparse, list of (int, float)}
  555. Input vector in any format
  556. norm : {'l1', 'l2'}, optional
  557. Metric to normalize in.
  558. return_norm : bool, optional
  559. Return the length of vector `vec`, in addition to the normalized vector itself?
  560. Returns
  561. -------
  562. numpy.ndarray, scipy.sparse, list of (int, float)}
  563. Normalized vector in same format as `vec`.
  564. float
  565. Length of `vec` before normalization, if `return_norm` is set.
  566. Notes
  567. -----
  568. Zero-vector will be unchanged.
  569. """
  570. if norm not in ('l1', 'l2'):
  571. raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm)
  572. if scipy.sparse.issparse(vec):
  573. vec = vec.tocsr()
  574. if norm == 'l1':
  575. veclen = np.sum(np.abs(vec.data))
  576. if norm == 'l2':
  577. veclen = np.sqrt(np.sum(vec.data ** 2))
  578. if veclen > 0.0:
  579. if np.issubdtype(vec.dtype, np.int):
  580. vec = vec.astype(np.float)
  581. vec /= veclen
  582. if return_norm:
  583. return vec, veclen
  584. else:
  585. return vec
  586. else:
  587. if return_norm:
  588. return vec, 1.
  589. else:
  590. return vec
  591. if isinstance(vec, np.ndarray):
  592. if norm == 'l1':
  593. veclen = np.sum(np.abs(vec))
  594. if norm == 'l2':
  595. veclen = blas_nrm2(vec)
  596. if veclen > 0.0:
  597. if np.issubdtype(vec.dtype, np.int):
  598. vec = vec.astype(np.float)
  599. if return_norm:
  600. return blas_scal(1.0 / veclen, vec).astype(vec.dtype), veclen
  601. else:
  602. return blas_scal(1.0 / veclen, vec).astype(vec.dtype)
  603. else:
  604. if return_norm:
  605. return vec, 1
  606. else:
  607. return vec
  608. try:
  609. first = next(iter(vec)) # is there at least one element?
  610. except StopIteration:
  611. return vec
  612. if isinstance(first, (tuple, list)) and len(first) == 2: # gensim sparse format
  613. if norm == 'l1':
  614. length = float(sum(abs(val) for _, val in vec))
  615. if norm == 'l2':
  616. length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
  617. assert length > 0.0, "sparse documents must not contain any explicit zero entries"
  618. if return_norm:
  619. return ret_normalized_vec(vec, length), length
  620. else:
  621. return ret_normalized_vec(vec, length)
  622. else:
  623. raise ValueError("unknown input type")
  624. def cossim(vec1, vec2):
  625. """Get cosine similarity between two sparse vectors.
  626. Cosine similarity is a number between `<-1.0, 1.0>`, higher means more similar.
  627. Parameters
  628. ----------
  629. vec1 : list of (int, float)
  630. Vector in BoW format.
  631. vec2 : list of (int, float)
  632. Vector in BoW format.
  633. Returns
  634. -------
  635. float
  636. Cosine similarity between `vec1` and `vec2`.
  637. """
  638. vec1, vec2 = dict(vec1), dict(vec2)
  639. if not vec1 or not vec2:
  640. return 0.0
  641. vec1len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec1)))
  642. vec2len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec2)))
  643. assert vec1len > 0.0 and vec2len > 0.0, "sparse documents must not contain any explicit zero entries"
  644. if len(vec2) < len(vec1):
  645. vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector
  646. result = sum(value * vec2.get(index, 0.0) for index, value in iteritems(vec1))
  647. result /= vec1len * vec2len # rescale by vector lengths
  648. return result
  649. def softcossim(vec1, vec2, similarity_matrix):
  650. """Get Soft Cosine Measure between two vectors given a term similarity matrix.
  651. Return Soft Cosine Measure between two sparse vectors given a sparse term similarity matrix
  652. in the :class:`scipy.sparse.csc_matrix` format. The similarity is a number between `<-1.0, 1.0>`,
  653. higher is more similar.
  654. Notes
  655. -----
  656. Soft Cosine Measure was perhaps first defined by `Grigori Sidorov et al.,
  657. "Soft Similarity and Soft Cosine Measure: Similarity of Features in Vector Space Model"
  658. <http://www.cys.cic.ipn.mx/ojs/index.php/CyS/article/view/2043/1921>`_.
  659. Parameters
  660. ----------
  661. vec1 : list of (int, float)
  662. A query vector in the BoW format.
  663. vec2 : list of (int, float)
  664. A document vector in the BoW format.
  665. similarity_matrix : {:class:`scipy.sparse.csc_matrix`, :class:`scipy.sparse.csr_matrix`}
  666. A term similarity matrix, typically produced by
  667. :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`.
  668. Returns
  669. -------
  670. `similarity_matrix.dtype`
  671. The Soft Cosine Measure between `vec1` and `vec2`.
  672. Raises
  673. ------
  674. ValueError
  675. When the term similarity matrix is in an unknown format.
  676. See Also
  677. --------
  678. :meth:`gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`
  679. A term similarity matrix produced from term embeddings.
  680. :class:`gensim.similarities.docsim.SoftCosineSimilarity`
  681. A class for performing corpus-based similarity queries with Soft Cosine Measure.
  682. """
  683. if not isinstance(similarity_matrix, scipy.sparse.csc_matrix):
  684. if isinstance(similarity_matrix, scipy.sparse.csr_matrix):
  685. similarity_matrix = similarity_matrix.T
  686. else:
  687. raise ValueError('unknown similarity matrix format')
  688. if not vec1 or not vec2:
  689. return 0.0
  690. vec1 = dict(vec1)
  691. vec2 = dict(vec2)
  692. word_indices = sorted(set(chain(vec1, vec2)))
  693. dtype = similarity_matrix.dtype
  694. vec1 = np.array([vec1[i] if i in vec1 else 0 for i in word_indices], dtype=dtype)
  695. vec2 = np.array([vec2[i] if i in vec2 else 0 for i in word_indices], dtype=dtype)
  696. dense_matrix = similarity_matrix[[[i] for i in word_indices], word_indices].todense()
  697. vec1len = vec1.T.dot(dense_matrix).dot(vec1)[0, 0]
  698. vec2len = vec2.T.dot(dense_matrix).dot(vec2)[0, 0]
  699. assert \
  700. vec1len > 0.0 and vec2len > 0.0, \
  701. u"sparse documents must not contain any explicit zero entries and the similarity matrix S " \
  702. u"must satisfy x^T * S * x > 0 for any nonzero bag-of-words vector x."
  703. result = vec1.T.dot(dense_matrix).dot(vec2)[0, 0]
  704. result /= math.sqrt(vec1len) * math.sqrt(vec2len) # rescale by vector lengths
  705. return np.clip(result, -1.0, 1.0)
  706. def isbow(vec):
  707. """Checks if a vector is in the sparse Gensim bag-of-words format.
  708. Parameters
  709. ----------
  710. vec : object
  711. Object to check.
  712. Returns
  713. -------
  714. bool
  715. Is `vec` in BoW format.
  716. """
  717. if scipy.sparse.issparse(vec):
  718. vec = vec.todense().tolist()
  719. try:
  720. id_, val_ = vec[0] # checking first value to see if it is in bag of words format by unpacking
  721. int(id_), float(val_)
  722. except IndexError:
  723. return True # this is to handle the empty input case
  724. except (ValueError, TypeError):
  725. return False
  726. return True
  727. def _convert_vec(vec1, vec2, num_features=None):
  728. if scipy.sparse.issparse(vec1):
  729. vec1 = vec1.toarray()
  730. if scipy.sparse.issparse(vec2):
  731. vec2 = vec2.toarray() # converted both the vectors to dense in case they were in sparse matrix
  732. if isbow(vec1) and isbow(vec2): # if they are in bag of words format we make it dense
  733. if num_features is not None: # if not None, make as large as the documents drawing from
  734. dense1 = sparse2full(vec1, num_features)
  735. dense2 = sparse2full(vec2, num_features)
  736. return dense1, dense2
  737. else:
  738. max_len = max(len(vec1), len(vec2))
  739. dense1 = sparse2full(vec1, max_len)
  740. dense2 = sparse2full(vec2, max_len)
  741. return dense1, dense2
  742. else:
  743. # this conversion is made because if it is not in bow format, it might be a list within a list after conversion
  744. # the scipy implementation of Kullback fails in such a case so we pick up only the nested list.
  745. if len(vec1) == 1:
  746. vec1 = vec1[0]
  747. if len(vec2) == 1:
  748. vec2 = vec2[0]
  749. return vec1, vec2
  750. def kullback_leibler(vec1, vec2, num_features=None):
  751. """Calculate Kullback-Leibler distance between two probability distributions using `scipy.stats.entropy`.
  752. Parameters
  753. ----------
  754. vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
  755. Distribution vector.
  756. vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
  757. Distribution vector.
  758. num_features : int, optional
  759. Number of features in the vectors.
  760. Returns
  761. -------
  762. float
  763. Kullback-Leibler distance between `vec1` and `vec2`.
  764. Value in range [0, +) where values closer to 0 mean less distance (higher similarity).
  765. """
  766. vec1, vec2 = _convert_vec(vec1, vec2, num_features=num_features)
  767. return entropy(vec1, vec2)
  768. def jensen_shannon(vec1, vec2, num_features=None):
  769. """Calculate Jensen-Shannon distance between two probability distributions using `scipy.stats.entropy`.
  770. Parameters
  771. ----------
  772. vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
  773. Distribution vector.
  774. vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
  775. Distribution vector.
  776. num_features : int, optional
  777. Number of features in the vectors.
  778. Returns
  779. -------
  780. float
  781. Jensen-Shannon distance between `vec1` and `vec2`.
  782. Notes
  783. -----
  784. This is a symmetric and finite "version" of :func:`gensim.matutils.kullback_leibler`.
  785. """
  786. vec1, vec2 = _convert_vec(vec1, vec2, num_features=num_features)
  787. avg_vec = 0.5 * (vec1 + vec2)
  788. return 0.5 * (entropy(vec1, avg_vec) + entropy(vec2, avg_vec))
  789. def hellinger(vec1, vec2):
  790. """Calculate Hellinger distance between two probability distributions.
  791. Parameters
  792. ----------
  793. vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
  794. Distribution vector.
  795. vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
  796. Distribution vector.
  797. Returns
  798. -------
  799. float
  800. Hellinger distance between `vec1` and `vec2`.
  801. Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity).
  802. """
  803. if scipy.sparse.issparse(vec1):
  804. vec1 = vec1.toarray()
  805. if scipy.sparse.issparse(vec2):
  806. vec2 = vec2.toarray()
  807. if isbow(vec1) and isbow(vec2):
  808. # if it is a BoW format, instead of converting to dense we use dictionaries to calculate appropriate distance
  809. vec1, vec2 = dict(vec1), dict(vec2)
  810. indices = set(list(vec1.keys()) + list(vec2.keys()))
  811. sim = np.sqrt(
  812. 0.5 * sum((np.sqrt(vec1.get(index, 0.0)) - np.sqrt(vec2.get(index, 0.0)))**2 for index in indices)
  813. )
  814. return sim
  815. else:
  816. sim = np.sqrt(0.5 * ((np.sqrt(vec1) - np.sqrt(vec2))**2).sum())
  817. return sim
  818. def jaccard(vec1, vec2):
  819. """Calculate Jaccard distance between two vectors.
  820. Parameters
  821. ----------
  822. vec1 : {scipy.sparse, numpy.ndarray, list of (int, float)}
  823. Distribution vector.
  824. vec2 : {scipy.sparse, numpy.ndarray, list of (int, float)}
  825. Distribution vector.
  826. Returns
  827. -------
  828. float
  829. Jaccard distance between `vec1` and `vec2`.
  830. Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity).
  831. """
  832. # converting from sparse for easier manipulation
  833. if scipy.sparse.issparse(vec1):
  834. vec1 = vec1.toarray()
  835. if scipy.sparse.issparse(vec2):
  836. vec2 = vec2.toarray()
  837. if isbow(vec1) and isbow(vec2):
  838. # if it's in bow format, we use the following definitions:
  839. # union = sum of the 'weights' of both the bags
  840. # intersection = lowest weight for a particular id; basically the number of common words or items
  841. union = sum(weight for id_, weight in vec1) + sum(weight for id_, weight in vec2)
  842. vec1, vec2 = dict(vec1), dict(vec2)
  843. intersection = 0.0
  844. for feature_id, feature_weight in iteritems(vec1):
  845. intersection += min(feature_weight, vec2.get(feature_id, 0.0))
  846. return 1 - float(intersection) / float(union)
  847. else:
  848. # if it isn't in bag of words format, we can use sets to calculate intersection and union
  849. if isinstance(vec1, np.ndarray):
  850. vec1 = vec1.tolist()
  851. if isinstance(vec2, np.ndarray):
  852. vec2 = vec2.tolist()
  853. vec1 = set(vec1)
  854. vec2 = set(vec2)
  855. intersection = vec1 & vec2
  856. union = vec1 | vec2
  857. return 1 - float(len(intersection)) / float(len(union))
  858. def jaccard_distance(set1, set2):
  859. """Calculate Jaccard distance between two sets.
  860. Parameters
  861. ----------
  862. set1 : set
  863. Input set.
  864. set2 : set
  865. Input set.
  866. Returns
  867. -------
  868. float
  869. Jaccard distance between `set1` and `set2`.
  870. Value in range `[0, 1]`, where 0 is min distance (max similarity) and 1 is max distance (min similarity).
  871. """
  872. union_cardinality = len(set1 | set2)
  873. if union_cardinality == 0: # Both sets are empty
  874. return 1.
  875. return 1. - float(len(set1 & set2)) / float(union_cardinality)
  876. try:
  877. # try to load fast, cythonized code if possible
  878. from gensim._matutils import logsumexp, mean_absolute_difference, dirichlet_expectation
  879. except ImportError:
  880. def logsumexp(x):
  881. """Log of sum of exponentials.
  882. Parameters
  883. ----------
  884. x : numpy.ndarray
  885. Input 2d matrix.
  886. Returns
  887. -------
  888. float
  889. log of sum of exponentials of elements in `x`.
  890. Warnings
  891. --------
  892. For performance reasons, doesn't support NaNs or 1d, 3d, etc arrays like :func:`scipy.special.logsumexp`.
  893. """
  894. x_max = np.max(x)
  895. x = np.log(np.sum(np.exp(x - x_max)))
  896. x += x_max
  897. return x
  898. def mean_absolute_difference(a, b):
  899. """Mean absolute difference between two arrays.
  900. Parameters
  901. ----------
  902. a : numpy.ndarray
  903. Input 1d array.
  904. b : numpy.ndarray
  905. Input 1d array.
  906. Returns
  907. -------
  908. float
  909. mean(abs(a - b)).
  910. """
  911. return np.mean(np.abs(a - b))
  912. def dirichlet_expectation(alpha):
  913. """Expected value of log(theta) where theta is drawn from a Dirichlet distribution.
  914. Parameters
  915. ----------
  916. alpha : numpy.ndarray
  917. Dirichlet parameter 2d matrix or 1d vector, if 2d - each row is treated as a separate parameter vector.
  918. Returns
  919. -------
  920. numpy.ndarray
  921. Log of expected values, dimension same as `alpha.ndim`.
  922. """
  923. if len(alpha.shape) == 1:
  924. result = psi(alpha) - psi(np.sum(alpha))
  925. else:
  926. result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
  927. return result.astype(alpha.dtype, copy=False) # keep the same precision as input
def qr_destroy(la):
    """Get QR decomposition of `la[0]`.

    Parameters
    ----------
    la : list of numpy.ndarray
        Run QR decomposition on the first elements of `la`. Must not be empty.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Matrices :math:`Q` and :math:`R`.

    Notes
    -----
    Using this function is less memory intense than calling `scipy.linalg.qr(la[0])`,
    because the memory used in `la[0]` is reclaimed earlier. This makes a difference when
    decomposing very large arrays, where every memory copy counts.

    Warnings
    --------
    Content of `la` as well as `la[0]` gets destroyed in the process. Again, for memory-efficiency reasons.

    """
    a = np.asfortranarray(la[0])
    # Remove the matrix from the caller's list and drop our own handle on the list.
    del la[0], la  # now `a` is the only reference to the input matrix
    m, n = a.shape
    # perform q, r = QR(a); code hacked out of scipy.linalg.qr
    logger.debug("computing QR of %s dense matrix", str(a.shape))
    geqrf, = get_lapack_funcs(('geqrf',), (a,))
    # Two-step LAPACK call: the first call (lwork=-1) is used only to obtain `work[0]`,
    # which is then passed as `lwork` to the second call that does the factorization.
    qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True)
    qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True)
    del a  # free up mem
    assert info >= 0
    # R is the upper triangle of the top-left n x n corner of the factorization output
    r = triu(qr[:n, :n])
    if m < n:  # rare case, #features < #topics
        qr = qr[:, :m]  # retains fortran order
    gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
    # Same two-step pattern as above, this time for `orgqr`, which produces Q from `qr` and `tau`.
    q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
    q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous
    return q, r
  966. class MmWriter(object):
  967. """Store a corpus in `Matrix Market format <https://math.nist.gov/MatrixMarket/formats.html>`_,
  968. using :class:`~gensim.corpora.mmcorpus.MmCorpus`.
  969. Notes
  970. -----
  971. The output is written one document at a time, not the whole matrix at once (unlike e.g. `scipy.io.mmread`).
  972. This allows you to write corpora which are larger than the available RAM.
  973. The output file is created in a single pass through the input corpus, so that the input can be
  974. a once-only stream (generator).
  975. To achieve this, a fake MM header is written first, corpus statistics are collected
  976. during the pass (shape of the matrix, number of non-zeroes), followed by a seek back to the beginning of the file,
  977. rewriting the fake header with the final values.
  978. """
  979. HEADER_LINE = b'%%MatrixMarket matrix coordinate real general\n' # the only supported MM format
  980. def __init__(self, fname):
  981. """
  982. Parameters
  983. ----------
  984. fname : str
  985. Path to output file.
  986. """
  987. self.fname = fname
  988. if fname.endswith(".gz") or fname.endswith('.bz2'):
  989. raise NotImplementedError("compressed output not supported with MmWriter")
  990. self.fout = utils.smart_open(self.fname, 'wb+') # open for both reading and writing
  991. self.headers_written = False
  992. def write_headers(self, num_docs, num_terms, num_nnz):
  993. """Write headers to file.
  994. Parameters
  995. ----------
  996. num_docs : int
  997. Number of documents in corpus.
  998. num_terms : int
  999. Number of term in corpus.
  1000. num_nnz : int
  1001. Number of non-zero elements in corpus.
  1002. """
  1003. self.fout.write(MmWriter.HEADER_LINE)
  1004. if num_nnz < 0:
  1005. # we don't know the matrix shape/density yet, so only log a general line
  1006. logger.info("saving sparse matrix to %s", self.fname)
  1007. self.fout.write(utils.to_utf8(' ' * 50 + '\n')) # 48 digits must be enough for everybody
  1008. else:
  1009. logger.info(
  1010. "saving sparse %sx%s matrix with %i non-zero entries to %s",
  1011. num_docs, num_terms, num_nnz, self.fname
  1012. )
  1013. self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz)))
  1014. self.last_docno = -1
  1015. self.headers_written = True
  1016. def fake_headers(self, num_docs, num_terms, num_nnz):
  1017. """Write "fake" headers to file, to be rewritten once we've scanned the entire corpus.
  1018. Parameters
  1019. ----------
  1020. num_docs : int
  1021. Number of documents in corpus.
  1022. num_terms : int
  1023. Number of term in corpus.
  1024. num_nnz : int
  1025. Number of non-zero elements in corpus.
  1026. """
  1027. stats = '%i %i %i' % (num_docs, num_terms, num_nnz)
  1028. if len(stats) > 50:
  1029. raise ValueError('Invalid stats: matrix too large!')
  1030. self.fout.seek(len(MmWriter.HEADER_LINE))
  1031. self.fout.write(utils.to_utf8(stats))
  1032. def write_vector(self, docno, vector):
  1033. """Write a single sparse vector to the file.
  1034. Parameters
  1035. ----------
  1036. docno : int
  1037. Number of document.
  1038. vector : list of (int, number)
  1039. Document in BoW format.
  1040. Returns
  1041. -------
  1042. (int, int)
  1043. Max word index in vector and len of vector. If vector is empty, return (-1, 0).
  1044. """
  1045. assert self.headers_written, "must write Matrix Market file headers before writing data!"
  1046. assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (self.last_docno, docno)
  1047. vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12) # ignore near-zero entries
  1048. for termid, weight in vector: # write term ids in sorted order
  1049. # +1 because MM format starts counting from 1
  1050. self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight)))
  1051. self.last_docno = docno
  1052. return (vector[-1][0], len(vector)) if vector else (-1, 0)
  1053. @staticmethod
  1054. def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False):
  1055. """Save the corpus to disk in `Matrix Market format <https://math.nist.gov/MatrixMarket/formats.html>`_.
  1056. Parameters
  1057. ----------
  1058. fname : str
  1059. Filename of the resulting file.
  1060. corpus : iterable of list of (int, number)
  1061. Corpus in streamed bag-of-words format.
  1062. progress_cnt : int, optional
  1063. Print progress for every `progress_cnt` number of documents.
  1064. index : bool, optional
  1065. Return offsets?
  1066. num_terms : int, optional
  1067. Number of terms in the corpus. If provided, the `corpus.num_terms` attribute (if any) will be ignored.
  1068. metadata : bool, optional
  1069. Generate a metadata file?
  1070. Returns
  1071. -------
  1072. offsets : {list of int, None}
  1073. List of offsets (if index=True) or nothing.
  1074. Notes
  1075. -----
  1076. Documents are processed one at a time, so the whole corpus is allowed to be larger than the available RAM.
  1077. See Also
  1078. --------
  1079. :func:`gensim.corpora.mmcorpus.MmCorpus.save_corpus`
  1080. Save corpus to disk.
  1081. """
  1082. mw = MmWriter(fname)
  1083. # write empty headers to the file (with enough space to be overwritten later)
  1084. mw.write_headers(-1, -1, -1) # will print 50 spaces followed by newline on the stats line
  1085. # calculate necessary header info (nnz elements, num terms, num docs) while writing out vectors
  1086. _num_terms, num_nnz = 0, 0
  1087. docno, poslast = -1, -1
  1088. offsets = []
  1089. if hasattr(corpus, 'metadata'):
  1090. orig_metadata = corpus.metadata
  1091. corpus.metadata = metadata
  1092. if metadata:
  1093. docno2metadata = {}
  1094. else:
  1095. metadata = False
  1096. for docno, doc in enumerate(corpus):
  1097. if metadata:
  1098. bow, data = doc
  1099. docno2metadata[docno] = data
  1100. else:
  1101. bow = doc
  1102. if docno % progress_cnt == 0:
  1103. logger.info("PROGRESS: saving document #%i", docno)
  1104. if index:
  1105. posnow = mw.fout.tell()
  1106. if posnow == poslast:
  1107. offsets[-1] = -1
  1108. offsets.append(posnow)
  1109. poslast = posnow
  1110. max_id, veclen = mw.write_vector(docno, bow)
  1111. _num_terms = max(_num_terms, 1 + max_id)
  1112. num_nnz += veclen
  1113. if metadata:
  1114. utils.pickle(docno2metadata, fname + '.metadata.cpickle')
  1115. corpus.metadata = orig_metadata
  1116. num_docs = docno + 1
  1117. num_terms = num_terms or _num_terms
  1118. if num_docs * num_terms != 0:
  1119. logger.info(
  1120. "saved %ix%i matrix, density=%.3f%% (%i/%i)",
  1121. num_docs, num_terms, 100.0 * num_nnz / (num_docs * num_terms), num_nnz, num_docs * num_terms
  1122. )
  1123. # now write proper headers, by seeking and overwriting the spaces written earlier
  1124. mw.fake_headers(num_docs, num_terms, num_nnz)
  1125. mw.close()
  1126. if index:
  1127. return offsets
  1128. def __del__(self):
  1129. """Close `self.fout` file. Alias for :meth:`~gensim.matutils.MmWriter.close`.
  1130. Warnings
  1131. --------
  1132. Closing the file explicitly via the close() method is preferred and safer.
  1133. """
  1134. self.close() # does nothing if called twice (on an already closed file), so no worries
  1135. def close(self):
  1136. """Close `self.fout` file."""
  1137. logger.debug("closing %s", self.fname)
  1138. if hasattr(self, 'fout'):
  1139. self.fout.close()
  1140. try:
  1141. # try to load fast, cythonized code if possible
  1142. from gensim.corpora._mmreader import MmReader
  1143. except ImportError:
  1144. FAST_VERSION = -1
  1145. class MmReader(object):
  1146. """Matrix market file reader, used internally in :class:`~gensim.corpora.mmcorpus.MmCorpus`.
  1147. Wrap a term-document matrix on disk (in matrix-market format), and present it
  1148. as an object which supports iteration over the rows (~documents).
  1149. Attributes
  1150. ----------
  1151. num_docs : int
  1152. Number of documents in market matrix file.
  1153. num_terms : int
  1154. Number of terms.
  1155. num_nnz : int
  1156. Number of non-zero terms.
  1157. Notes
  1158. -----
  1159. Note that the file is read into memory one document at a time, not the whole matrix at once
  1160. (unlike e.g. `scipy.io.mmread` and other implementations).
  1161. This allows us to process corpora which are larger than the available RAM.
  1162. """
  1163. def __init__(self, input, transposed=True):
  1164. """
  1165. Parameters
  1166. ----------
  1167. input : {str, file-like object}
  1168. Path to the input file in MM format or a file-like object that supports `seek()`
  1169. (e.g. smart_open objects).
  1170. transposed : bool, optional
  1171. Do lines represent `doc_id, term_id, value`, instead of `term_id, doc_id, value`?
  1172. """
  1173. logger.info("initializing corpus reader from %s", input)
  1174. self.input, self.transposed = input, transposed
  1175. with utils.open_file(self.input) as lines:
  1176. try:
  1177. header = utils.to_unicode(next(lines)).strip()
  1178. if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
  1179. raise ValueError(
  1180. "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
  1181. (self.input, header)
  1182. )
  1183. except StopIteration:
  1184. pass
  1185. self.num_docs = self.num_terms = self.num_nnz = 0
  1186. for lineno, line in enumerate(lines):
  1187. line = utils.to_unicode(line)
  1188. if not line.startswith('%'):
  1189. self.num_docs, self.num_terms, self.num_nnz = (int(x) for x in line.split())
  1190. if not self.transposed:
  1191. self.num_docs, self.num_terms = self.num_terms, self.num_docs
  1192. break
  1193. logger.info(
  1194. "accepted corpus with %i documents, %i features, %i non-zero entries",
  1195. self.num_docs, self.num_terms, self.num_nnz
  1196. )
  1197. def __len__(self):
  1198. """Get the corpus size: total number of documents."""
  1199. return self.num_docs
  1200. def __str__(self):
  1201. return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
  1202. (self.num_docs, self.num_terms, self.num_nnz))
  1203. def skip_headers(self, input_file):
  1204. """Skip file headers that appear before the first document.
  1205. Parameters
  1206. ----------
  1207. input_file : iterable of str
  1208. Iterable taken from file in MM format.
  1209. """
  1210. for line in input_file:
  1211. if line.startswith(b'%'):
  1212. continue
  1213. break
  1214. def __iter__(self):
  1215. """Iterate through all documents in the corpus.
  1216. Notes
  1217. ------
  1218. Note that the total number of vectors returned is always equal to the number of rows specified
  1219. in the header: empty documents are inserted and yielded where appropriate, even if they are not explicitly
  1220. stored in the Matrix Market file.
  1221. Yields
  1222. ------
  1223. (int, list of (int, number))
  1224. Document id and document in sparse bag-of-words format.
  1225. """
  1226. with utils.file_or_filename(self.input) as lines:
  1227. self.skip_headers(lines)
  1228. previd = -1
  1229. for line in lines:
  1230. docid, termid, val = utils.to_unicode(line).split() # needed for python3
  1231. if not self.transposed:
  1232. termid, docid = docid, termid
  1233. # -1 because matrix market indexes are 1-based => convert to 0-based
  1234. docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
  1235. assert previd <= docid, "matrix columns must come in ascending order"
  1236. if docid != previd:
  1237. # change of document: return the document read so far (its id is prevId)
  1238. if previd >= 0:
  1239. yield previd, document # noqa:F821
  1240. # return implicit (empty) documents between previous id and new id
  1241. # too, to keep consistent document numbering and corpus length
  1242. for previd in xrange(previd + 1, docid):
  1243. yield previd, []
  1244. # from now on start adding fields to a new document, with a new id
  1245. previd = docid
  1246. document = []
  1247. document.append((termid, val,)) # add another field to the current document
  1248. # handle the last document, as a special case
  1249. if previd >= 0:
  1250. yield previd, document
  1251. # return empty documents between the last explicit document and the number
  1252. # of documents as specified in the header
  1253. for previd in xrange(previd + 1, self.num_docs):
  1254. yield previd, []
  1255. def docbyoffset(self, offset):
  1256. """Get the document at file offset `offset` (in bytes).
  1257. Parameters
  1258. ----------
  1259. offset : int
  1260. File offset, in bytes, of the desired document.
  1261. Returns
  1262. ------
  1263. list of (int, str)
  1264. Document in sparse bag-of-words format.
  1265. """
  1266. # empty documents are not stored explicitly in MM format, so the index marks
  1267. # them with a special offset, -1.
  1268. if offset == -1:
  1269. return []
  1270. if isinstance(self.input, string_types):
  1271. fin, close_fin = utils.smart_open(self.input), True
  1272. else:
  1273. fin, close_fin = self.input, False
  1274. fin.seek(offset) # works for gzip/bz2 input, too
  1275. previd, document = -1, []
  1276. for line in fin:
  1277. docid, termid, val = line.split()
  1278. if not self.transposed:
  1279. termid, docid = docid, termid
  1280. # -1 because matrix market indexes are 1-based => convert to 0-based
  1281. docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
  1282. assert previd <= docid, "matrix columns must come in ascending order"
  1283. if docid != previd:
  1284. if previd >= 0:
  1285. break
  1286. previd = docid
  1287. document.append((termid, val,)) # add another field to the current document
  1288. if close_fin:
  1289. fin.close()
  1290. return document