#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Basic interfaces used across the whole Gensim package.

These interfaces are used for building corpora, model transformation and similarity queries.

The interfaces are realized as abstract base classes. This means some functionality is already
provided in the interface itself, and subclasses should inherit from these interfaces
and implement the missing methods.

"""

from __future__ import with_statement

import logging

from gensim import utils, matutils
from six.moves import xrange


logger = logging.getLogger(__name__)


class CorpusABC(utils.SaveLoad):
    """Interface for corpus classes from :mod:`gensim.corpora`.

    A corpus is simply an iterable object, where each iteration step yields one document:

    >>> from gensim.corpora import MmCorpus  # inherits from the CorpusABC class
    >>> from gensim.test.utils import datapath
    >>>
    >>> corpus = MmCorpus(datapath("testcorpus.mm"))
    >>> for doc in corpus:
    ...     pass  # do something with the doc...

    A document is represented in the bag-of-words (BoW) format, i.e. as a list of
    (attr_id, attr_value) 2-tuples, like ``[(1, 0.2), (4, 0.6), ...]``.

    >>> from gensim.corpora import MmCorpus  # inherits from the CorpusABC class
    >>> from gensim.test.utils import datapath
    >>>
    >>> corpus = MmCorpus(datapath("testcorpus.mm"))
    >>> doc = next(iter(corpus))
    >>> print(doc)
    [(0, 1.0), (1, 1.0), (2, 1.0)]

    Remember that the save/load methods only persist the corpus object (its state), not the corpus
    data itself. To save the data as well, use this pattern:

    >>> from gensim.corpora import MmCorpus  # inherits from the CorpusABC class
    >>> from gensim.test.utils import datapath, get_tmpfile
    >>>
    >>> corpus = MmCorpus(datapath("testcorpus.mm"))
    >>> tmp_path = get_tmpfile("temp_corpus.mm")
    >>>
    >>> MmCorpus.serialize(tmp_path, corpus)  # serialize corpus to disk in the MmCorpus format
    >>> # MmCorpus.save_corpus(tmp_path, corpus)  # this variant is also possible, but prefer `serialize` when available
    >>> loaded_corpus = MmCorpus(tmp_path)  # load the corpus back through the constructor
    >>> for (doc_1, doc_2) in zip(corpus, loaded_corpus):
    ...     assert doc_1 == doc_2  # check that the two corpora are exactly the same

    See Also
    --------
    :mod:`gensim.corpora`
        Corpora in different formats.

    """
    def __iter__(self):
        """Iterate over the corpus, yielding one document at a time."""
        raise NotImplementedError('cannot instantiate abstract base class')

    def save(self, *args, **kwargs):
        """Save the in-memory state of the corpus.

        Warnings
        --------
        This saves only the "state" of the corpus class, not the corpus data!

        To save the data, use the `serialize` method of the output format you'd like to use
        (e.g. :meth:`gensim.corpora.mmcorpus.MmCorpus.serialize`).

        """
        import warnings
        warnings.warn(
            "corpus.save() stores only the (tiny) iteration object in memory; "
            "to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)"
        )
        super(CorpusABC, self).save(*args, **kwargs)

    def __len__(self):
        """Get the corpus size = the total number of documents in it."""
        raise NotImplementedError("must override __len__() before calling len(corpus)")

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save `corpus` to disk.

        Some formats support saving the dictionary (`feature_id -> word` mapping),
        which can be provided by the optional `id2word` parameter.

        Parameters
        ----------
        fname : str
            Path to the output file.
        corpus : iterable of list of (int, number)
            Corpus in BoW format.
        id2word : :class:`~gensim.corpora.Dictionary`, optional
            Dictionary of the corpus.
        metadata : bool, optional
            Write additional metadata to a separate file too?

        Notes
        -----
        Some corpora also support random access via document indexing, so that the documents on disk
        can be accessed in O(1) time (see the :class:`~gensim.corpora.indexedcorpus.IndexedCorpus` base class).
        In this case, :meth:`~gensim.interfaces.CorpusABC.save_corpus` is automatically called internally by
        :func:`serialize`, which does :meth:`~gensim.interfaces.CorpusABC.save_corpus` plus saves the index
        at the same time. Calling :func:`serialize` is therefore preferred to calling
        :meth:`~gensim.interfaces.CorpusABC.save_corpus` directly.
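
        Examples
        --------
        A minimal sketch, using :class:`~gensim.corpora.mmcorpus.MmCorpus` as one concrete
        implementation (in real code, prefer :func:`serialize`, as noted above). `common_corpus` is
        the small corpus bundled with Gensim's test utilities:

        >>> from gensim.corpora import MmCorpus
        >>> from gensim.test.utils import common_corpus, get_tmpfile
        >>>
        >>> tmp_path = get_tmpfile("saved_corpus.mm")
        >>> _ = MmCorpus.save_corpus(tmp_path, common_corpus)  # writes the documents, but not the offset index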

        """
        raise NotImplementedError('cannot instantiate abstract base class')


class TransformedCorpus(CorpusABC):
    """Interface for corpora that are the result of an online (streamed) transformation."""

    def __init__(self, obj, corpus, chunksize=None, **kwargs):
        """
        Parameters
        ----------
        obj : object
            A transformation :class:`~gensim.interfaces.TransformationABC` object that will be applied
            to each document from `corpus` during iteration.
        corpus : iterable of list of (int, number)
            Corpus in bag-of-words format.
        chunksize : int, optional
            If provided, slightly more efficient processing will be performed by grouping
            documents from `corpus` into chunks of `chunksize`.

        """
        self.obj, self.corpus, self.chunksize = obj, corpus, chunksize
        # add any extra parameters (e.g. `per_word_topics` for LdaModel) to the wrapped transformation object
        for key, value in kwargs.items():
            setattr(self.obj, key, value)
        self.metadata = False

    def __len__(self):
        """Get the corpus size."""
        return len(self.corpus)

    def __iter__(self):
        """Iterate over the corpus, applying the selected transformation.

        If `chunksize` was set in the constructor, works in a batched manner (more efficient).

        Yields
        ------
        list of (int, number)
            Documents in the sparse Gensim bag-of-words format.

        """
        if self.chunksize:
            for chunk in utils.grouper(self.corpus, self.chunksize):
                for transformed in self.obj.__getitem__(chunk, chunksize=None):
                    yield transformed
        else:
            for doc in self.corpus:
                yield self.obj[doc]

    def __getitem__(self, docno):
        """Transform the document at position `docno` within the `corpus` specified in the constructor.

        Parameters
        ----------
        docno : int
            Position of the document to transform, i.e. the document offset inside `self.corpus`.

        Notes
        -----
        `self.corpus` must support random indexing.

        Returns
        -------
        list of (int, number)
            Transformed document in the sparse Gensim bag-of-words format.

        Raises
        ------
        RuntimeError
            If the corpus doesn't support random access (doesn't define `__getitem__`).

        """
        if hasattr(self.corpus, '__getitem__'):
            return self.obj[self.corpus[docno]]
        else:
            raise RuntimeError('Type {} does not support slicing.'.format(type(self.corpus)))


class TransformationABC(utils.SaveLoad):
    """Transformation interface.

    A 'transformation' is any object which accepts a document in the sparse BoW format via
    `__getitem__` (the `[]` notation) and returns another sparse document in its stead:

    >>> from gensim.models import LsiModel
    >>> from gensim.test.utils import common_dictionary, common_corpus
    >>>
    >>> model = LsiModel(common_corpus, id2word=common_dictionary)
    >>> bow_vector = model[common_corpus[0]]  # model applied through __getitem__ to one document from the corpus
    >>> bow_corpus = model[common_corpus]  # the model can also be applied to the entire corpus
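
    Applying a model to a whole corpus is lazy: the result is a
    :class:`~gensim.interfaces.TransformedCorpus` wrapper, and individual documents are only
    transformed during iteration. A small sketch, continuing the example above (Gensim's built-in
    models dispatch whole-corpus input to :meth:`_apply` like this):

    >>> from gensim.interfaces import TransformedCorpus
    >>> assert isinstance(bow_corpus, TransformedCorpus)  # a lazy wrapper, nothing transformed yet
    >>> all_vectors = [vector for vector in bow_corpus]  # the actual transformation happens here, on the fly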

    """
    def __getitem__(self, vec):
        """Transform a single document, or a whole corpus, from one vector space into another.

        Parameters
        ----------
        vec : {list of (int, number), iterable of list of (int, number)}
            Document in the bag-of-words format, or a streamed corpus.

        """
        raise NotImplementedError('cannot instantiate abstract base class')

    def _apply(self, corpus, chunksize=None, **kwargs):
        """Apply the transformation to a whole corpus and get the result as another corpus.

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in the sparse Gensim bag-of-words format.
        chunksize : int, optional
            If provided, more efficient processing will be performed by grouping documents into chunks.

        Returns
        -------
        :class:`~gensim.interfaces.TransformedCorpus`
            Transformed corpus.

        """
        return TransformedCorpus(self, corpus, chunksize, **kwargs)


class SimilarityABC(utils.SaveLoad):
    """Interface for similarity search over a corpus.

    In all instances, there is a corpus against which we want to perform the similarity search.
    For each similarity search, the input is a document or a corpus, and the output is the similarity
    of the query to each individual corpus document.

    Examples
    --------
    >>> from gensim.similarities import MatrixSimilarity
    >>> from gensim.test.utils import common_dictionary, common_corpus
    >>>
    >>> index = MatrixSimilarity(common_corpus)
    >>> similarities = index.get_similarities(common_corpus[1])  # get similarities between the query and the corpus

    Notes
    -----
    There is also a convenience wrapper: iterating over `self` yields the similarities of each document
    in the corpus against the whole corpus (i.e. the query is each corpus document in turn).

    See Also
    --------
    :mod:`gensim.similarities`
        Different index implementations of this interface.

    """
    def __init__(self, corpus):
        """
        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in the sparse Gensim bag-of-words format.

        """
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def get_similarities(self, doc):
        """Get similarities of the given document or corpus against this index.

        Parameters
        ----------
        doc : {list of (int, number), iterable of list of (int, number)}
            Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents.

        """
        raise NotImplementedError("cannot instantiate Abstract Base Class")

    def __getitem__(self, query):
        """Get similarities of the given document or corpus against this index.

        Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents.

        Returns
        -------
        {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
            Similarities of the given document or corpus against this index; the exact type depends
            on `query` and on the index attributes `num_best` and `maintain_sparsity`.

        Notes
        -----
        Passing an entire corpus as `query` can be more efficient than passing its documents one
        after another, because it will issue queries in batches internally.
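
        Examples
        --------
        A short sketch, reusing the index from the class example above. With the default
        ``num_best=None`` (an attribute set by the concrete subclasses), all similarities are returned:

        >>> from gensim.similarities import MatrixSimilarity
        >>> from gensim.test.utils import common_corpus
        >>>
        >>> index = MatrixSimilarity(common_corpus)
        >>> query_sims = index[common_corpus[1]]  # unlike get_similarities(), `[]` also normalizes the query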

        """
        is_corpus, query = utils.is_corpus(query)
        if self.normalize:
            # self.normalize only works if the input is a plain gensim vector/corpus (as
            # advertised in the doc). in fact, the input can be a numpy or scipy.sparse matrix
            # as well, but in that case assume tricks are happening and don't normalize
            # anything (self.normalize has no effect).
            if not matutils.ismatrix(query):
                if is_corpus:
                    query = [matutils.unitvec(v) for v in query]
                else:
                    query = matutils.unitvec(query)
        result = self.get_similarities(query)

        if self.num_best is None:
            return result

        # if maintain_sparsity is True, result is a scipy sparse matrix. Sort, clip to
        # the top-n and return as a scipy sparse matrix.
        if getattr(self, 'maintain_sparsity', False):
            return matutils.scipy2scipy_clipped(result, self.num_best)

        # if the input query was a corpus (=more documents), compute the top-n
        # most similar for each document in turn
        if matutils.ismatrix(result):
            return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
        else:
            # otherwise, return the top-n of the single input document
            return matutils.full2sparse_clipped(result, self.num_best)

    def __iter__(self):
        """Iterate over all documents, computing the similarity of each document against all
        other documents in the index.

        Yields
        ------
        {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
            Similarities of the current document to all documents in the corpus.

        """
        # turn off query normalization (vectors in the index are assumed to be already normalized)
        norm = self.normalize
        self.normalize = False

        # Try to compute similarities in bigger chunks of documents (not
        # one query = a single document after another). The point is, a
        # bigger query of N documents is faster than N small queries of one
        # document each.
        #
        # After computing similarities of the bigger query in `self[chunk]`,
        # yield the resulting similarities one after another, so that it looks
        # exactly the same as if they had been computed with many small queries.
        try:
            chunking = self.chunksize > 1
        except AttributeError:
            # chunking not supported; fall back to the (slower) mode of 1 query = 1 document
            chunking = False
        if chunking:
            # assumes `self.corpus` holds the index as a 2-d numpy array.
            # this is true for MatrixSimilarity and SparseMatrixSimilarity, but
            # may not be true for other (future) classes..?
            for chunk_start in xrange(0, self.index.shape[0], self.chunksize):
                # scipy.sparse doesn't allow slicing beyond the real size of the matrix
                # (unlike numpy). so, clip the end of the chunk explicitly, to make
                # scipy.sparse happy
                chunk_end = min(self.index.shape[0], chunk_start + self.chunksize)
                chunk = self.index[chunk_start: chunk_end]
                for sim in self[chunk]:
                    yield sim
        else:
            for doc in self.index:
                yield self[doc]

        # restore the old normalization value
        self.normalize = norm