You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

391 lines
15 KiB

4 years ago
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. #
  4. # Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
  5. # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
  6. """Basic interfaces used across the whole Gensim package.
  7. These interfaces are used for building corpora, model transformation and similarity queries.
  8. The interfaces are realized as abstract base classes. This means some functionality is already
  9. provided in the interface itself, and subclasses should inherit from these interfaces
  10. and implement the missing methods.
  11. """
  12. from __future__ import with_statement
  13. import logging
  14. from gensim import utils, matutils
  15. from six.moves import xrange
  16. logger = logging.getLogger(__name__)
class CorpusABC(utils.SaveLoad):
    """Interface for corpus classes from :mod:`gensim.corpora`.

    Corpus is simply an iterable object, where each iteration step yields one document:

    >>> from gensim.corpora import MmCorpus  # this is inheritor of CorpusABC class
    >>> from gensim.test.utils import datapath
    >>>
    >>> corpus = MmCorpus(datapath("testcorpus.mm"))
    >>> for doc in corpus:
    ...     pass  # do something with the doc...

    A document represented in bag-of-word (BoW) format, i.e. list of (attr_id, attr_value),
    like ``[(1, 0.2), (4, 0.6), ...]``.

    >>> from gensim.corpora import MmCorpus  # this is inheritor of CorpusABC class
    >>> from gensim.test.utils import datapath
    >>>
    >>> corpus = MmCorpus(datapath("testcorpus.mm"))
    >>> doc = next(iter(corpus))
    >>> print(doc)
    [(0, 1.0), (1, 1.0), (2, 1.0)]

    Remember that the save/load methods save only the corpus class (not the corpus data itself);
    for save/load functionality, please use this pattern:

    >>> from gensim.corpora import MmCorpus  # this is inheritor of CorpusABC class
    >>> from gensim.test.utils import datapath, get_tmpfile
    >>>
    >>> corpus = MmCorpus(datapath("testcorpus.mm"))
    >>> tmp_path = get_tmpfile("temp_corpus.mm")
    >>>
    >>> MmCorpus.serialize(tmp_path, corpus)  # serialize corpus to disk in MmCorpus format
    >>> # MmCorpus.save_corpus(tmp_path, corpus)  # this variant also possible, but if serialize available - call it.
    >>> loaded_corpus = MmCorpus(tmp_path)  # load corpus through constructor
    >>> for (doc_1, doc_2) in zip(corpus, loaded_corpus):
    ...     assert doc_1 == doc_2  # check that corpuses exactly same

    See Also
    --------
    :mod:`gensim.corpora`
        Corpora in different formats.

    """
    def __iter__(self):
        """Iterate over the corpus, yielding one document (in BoW format) per step."""
        raise NotImplementedError('cannot instantiate abstract base class')

    def save(self, *args, **kwargs):
        """Save the corpus's in-memory state.

        Warnings
        --------
        This saves only the "state" of a corpus class, not the corpus data!
        For saving data use the `serialize` method of the output format you'd like to use
        (e.g. :meth:`gensim.corpora.mmcorpus.MmCorpus.serialize`).

        """
        import warnings
        # Warn loudly: users routinely expect save() to persist the documents themselves.
        warnings.warn(
            "corpus.save() stores only the (tiny) iteration object in memory; "
            "to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)"
        )
        super(CorpusABC, self).save(*args, **kwargs)

    def __len__(self):
        """Get the corpus size = the total number of documents in it."""
        raise NotImplementedError("must override __len__() before calling len(corpus)")

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save `corpus` to disk.

        Some formats support saving the dictionary (`feature_id -> word` mapping),
        which can be provided by the optional `id2word` parameter.

        Notes
        -----
        Some corpora also support random access via document indexing, so that the documents on disk
        can be accessed in O(1) time (see the :class:`gensim.corpora.indexedcorpus.IndexedCorpus` base class).
        In this case, :meth:`~gensim.interfaces.CorpusABC.save_corpus` is automatically called internally by
        :func:`serialize`, which does :meth:`~gensim.interfaces.CorpusABC.save_corpus` plus saves the index
        at the same time.

        Calling :func:`serialize` is preferred to calling :meth:`gensim.interfaces.CorpusABC.save_corpus`.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of list of (int, number)
            Corpus in BoW format.
        id2word : :class:`~gensim.corpora.Dictionary`, optional
            Dictionary of corpus.
        metadata : bool, optional
            Write additional metadata to a separate file too?

        """
        raise NotImplementedError('cannot instantiate abstract base class')
  98. class TransformedCorpus(CorpusABC):
  99. """Interface for corpora that are the result of an online (streamed) transformation."""
  100. def __init__(self, obj, corpus, chunksize=None, **kwargs):
  101. """
  102. Parameters
  103. ----------
  104. obj : object
  105. A transformation :class:`~gensim.interfaces.TransformationABC` object that will be applied
  106. to each document from `corpus` during iteration.
  107. corpus : iterable of list of (int, number)
  108. Corpus in bag-of-words format.
  109. chunksize : int, optional
  110. If provided, a slightly more effective processing will be performed by grouping documents from `corpus`.
  111. """
  112. self.obj, self.corpus, self.chunksize = obj, corpus, chunksize
  113. # add the new parameters like per_word_topics to base class object of LdaModel
  114. for key, value in kwargs.items():
  115. setattr(self.obj, key, value)
  116. self.metadata = False
  117. def __len__(self):
  118. """Get corpus size."""
  119. return len(self.corpus)
  120. def __iter__(self):
  121. """Iterate over the corpus, applying the selected transformation.
  122. If `chunksize` was set in the constructor, works in "batch-manner" (more efficient).
  123. Yields
  124. ------
  125. list of (int, number)
  126. Documents in the sparse Gensim bag-of-words format.
  127. """
  128. if self.chunksize:
  129. for chunk in utils.grouper(self.corpus, self.chunksize):
  130. for transformed in self.obj.__getitem__(chunk, chunksize=None):
  131. yield transformed
  132. else:
  133. for doc in self.corpus:
  134. yield self.obj[doc]
  135. def __getitem__(self, docno):
  136. """Transform the document at position `docno` within `corpus` specified in the constructor.
  137. Parameters
  138. ----------
  139. docno : int
  140. Position of the document to transform. Document offset inside `self.corpus`.
  141. Notes
  142. -----
  143. `self.corpus` must support random indexing.
  144. Returns
  145. -------
  146. list of (int, number)
  147. Transformed document in the sparse Gensim bag-of-words format.
  148. Raises
  149. ------
  150. RuntimeError
  151. If corpus doesn't support index slicing (`__getitem__` doesn't exists).
  152. """
  153. if hasattr(self.corpus, '__getitem__'):
  154. return self.obj[self.corpus[docno]]
  155. else:
  156. raise RuntimeError('Type {} does not support slicing.'.format(type(self.corpus)))
  157. class TransformationABC(utils.SaveLoad):
  158. """Transformation interface.
  159. A 'transformation' is any object which accepts document in BoW format via the `__getitem__` (notation `[]`)
  160. and returns another sparse document in its stead:
  161. >>> from gensim.models import LsiModel
  162. >>> from gensim.test.utils import common_dictionary, common_corpus
  163. >>>
  164. >>> model = LsiModel(common_corpus, id2word=common_dictionary)
  165. >>> bow_vector = model[common_corpus[0]] # model applied through __getitem__ on one document from corpus.
  166. >>> bow_corpus = model[common_corpus] # also, we can apply model on the full corpus
  167. """
  168. def __getitem__(self, vec):
  169. """Transform a single document, or a whole corpus, from one vector space into another.
  170. Parameters
  171. ----------
  172. vec : {list of (int, number), iterable of list of (int, number)}
  173. Document in bag-of-words, or streamed corpus.
  174. """
  175. raise NotImplementedError('cannot instantiate abstract base class')
  176. def _apply(self, corpus, chunksize=None, **kwargs):
  177. """Apply the transformation to a whole corpus and get the result as another corpus.
  178. Parameters
  179. ----------
  180. corpus : iterable of list of (int, number)
  181. Corpus in sparse Gensim bag-of-words format.
  182. chunksize : int, optional
  183. If provided, a more effective processing will performed.
  184. Returns
  185. -------
  186. :class:`~gensim.interfaces.TransformedCorpus`
  187. Transformed corpus.
  188. """
  189. return TransformedCorpus(self, corpus, chunksize, **kwargs)
  190. class SimilarityABC(utils.SaveLoad):
  191. """Interface for similarity search over a corpus.
  192. In all instances, there is a corpus against which we want to perform the similarity search.
  193. For each similarity search, the input is a document or a corpus, and the output are the similarities
  194. to individual corpus documents.
  195. Examples
  196. --------
  197. >>> from gensim.similarities import MatrixSimilarity
  198. >>> from gensim.test.utils import common_dictionary, common_corpus
  199. >>>
  200. >>> index = MatrixSimilarity(common_corpus)
  201. >>> similarities = index.get_similarities(common_corpus[1]) # get similarities between query and corpus
  202. Notes
  203. -----
  204. There is also a convenience wrapper, where iterating over `self` yields similarities of each document in the corpus
  205. against the whole corpus (i.e. the query is each corpus document in turn).
  206. See Also
  207. --------
  208. :mod:`gensim.similarities`
  209. Different index implementations of this interface.
  210. """
  211. def __init__(self, corpus):
  212. """
  213. Parameters
  214. ----------
  215. corpus : iterable of list of (int, number)
  216. Corpus in sparse Gensim bag-of-words format.
  217. """
  218. raise NotImplementedError("cannot instantiate Abstract Base Class")
  219. def get_similarities(self, doc):
  220. """Get similarities of the given document or corpus against this index.
  221. Parameters
  222. ----------
  223. doc : {list of (int, number), iterable of list of (int, number)}
  224. Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents.
  225. """
  226. raise NotImplementedError("cannot instantiate Abstract Base Class")
  227. def __getitem__(self, query):
  228. """Get similarities of the given document or corpus against this index.
  229. Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally.
  230. Notes
  231. -----
  232. Passing an entire corpus as `query` can be more efficient than passing its documents one after another,
  233. because it will issue queries in batches internally.
  234. Parameters
  235. ----------
  236. query : {list of (int, number), iterable of list of (int, number)}
  237. Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents.
  238. Returns
  239. -------
  240. {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
  241. Similarities given document or corpus and objects corpus, depends on `query`.
  242. """
  243. is_corpus, query = utils.is_corpus(query)
  244. if self.normalize:
  245. # self.normalize only works if the input is a plain gensim vector/corpus (as
  246. # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
  247. # as well, but in that case assume tricks are happening and don't normalize
  248. # anything (self.normalize has no effect).
  249. if not matutils.ismatrix(query):
  250. if is_corpus:
  251. query = [matutils.unitvec(v) for v in query]
  252. else:
  253. query = matutils.unitvec(query)
  254. result = self.get_similarities(query)
  255. if self.num_best is None:
  256. return result
  257. # if maintain_sparsity is True, result is scipy sparse. Sort, clip the
  258. # topn and return as a scipy sparse matrix.
  259. if getattr(self, 'maintain_sparsity', False):
  260. return matutils.scipy2scipy_clipped(result, self.num_best)
  261. # if the input query was a corpus (=more documents), compute the top-n
  262. # most similar for each document in turn
  263. if matutils.ismatrix(result):
  264. return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
  265. else:
  266. # otherwise, return top-n of the single input document
  267. return matutils.full2sparse_clipped(result, self.num_best)
  268. def __iter__(self):
  269. """Iterate over all documents, compute similarity of each document against all other documents in the index.
  270. Yields
  271. ------
  272. {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
  273. Similarity of the current document and all documents in the corpus.
  274. """
  275. # turn off query normalization (vectors in the index are assumed to be already normalized)
  276. norm = self.normalize
  277. self.normalize = False
  278. # Try to compute similarities in bigger chunks of documents (not
  279. # one query = a single document after another). The point is, a
  280. # bigger query of N documents is faster than N small queries of one
  281. # document.
  282. #
  283. # After computing similarities of the bigger query in `self[chunk]`,
  284. # yield the resulting similarities one after another, so that it looks
  285. # exactly the same as if they had been computed with many small queries.
  286. try:
  287. chunking = self.chunksize > 1
  288. except AttributeError:
  289. # chunking not supported; fall back to the (slower) mode of 1 query=1 document
  290. chunking = False
  291. if chunking:
  292. # assumes `self.corpus` holds the index as a 2-d numpy array.
  293. # this is true for MatrixSimilarity and SparseMatrixSimilarity, but
  294. # may not be true for other (future) classes..?
  295. for chunk_start in xrange(0, self.index.shape[0], self.chunksize):
  296. # scipy.sparse doesn't allow slicing beyond real size of the matrix
  297. # (unlike numpy). so, clip the end of the chunk explicitly to make
  298. # scipy.sparse happy
  299. chunk_end = min(self.index.shape[0], chunk_start + self.chunksize)
  300. chunk = self.index[chunk_start: chunk_end]
  301. for sim in self[chunk]:
  302. yield sim
  303. else:
  304. for doc in self.index:
  305. yield self[doc]
  306. # restore old normalization value
  307. self.normalize = norm