391 lines
15 KiB
Python
391 lines
15 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
|
|
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
|
|
|
|
"""Basic interfaces used across the whole Gensim package.
|
|
|
|
These interfaces are used for building corpora, model transformation and similarity queries.
|
|
|
|
The interfaces are realized as abstract base classes. This means some functionality is already
|
|
provided in the interface itself, and subclasses should inherit from these interfaces
|
|
and implement the missing methods.
|
|
|
|
"""
|
|
|
|
from __future__ import with_statement
|
|
|
|
import logging
|
|
|
|
from gensim import utils, matutils
|
|
from six.moves import xrange
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class CorpusABC(utils.SaveLoad):
|
|
"""Interface for corpus classes from :mod:`gensim.corpora`.
|
|
|
|
Corpus is simply an iterable object, where each iteration step yields one document:
|
|
|
|
>>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class
|
|
>>> from gensim.test.utils import datapath
|
|
>>>
|
|
>>> corpus = MmCorpus(datapath("testcorpus.mm"))
|
|
>>> for doc in corpus:
|
|
... pass # do something with the doc...
|
|
|
|
A document represented in bag-of-word (BoW) format, i.e. list of (attr_id, attr_value),
|
|
like ``[(1, 0.2), (4, 0.6), ...]``.
|
|
|
|
>>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class
|
|
>>> from gensim.test.utils import datapath
|
|
>>>
|
|
>>> corpus = MmCorpus(datapath("testcorpus.mm"))
|
|
>>> doc = next(iter(corpus))
|
|
>>> print(doc)
|
|
[(0, 1.0), (1, 1.0), (2, 1.0)]
|
|
|
|
Remember, that save/load methods save only corpus class (not corpus as data itself),
|
|
for save/load functionality, please use this pattern :
|
|
|
|
>>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class
|
|
>>> from gensim.test.utils import datapath, get_tmpfile
|
|
>>>
|
|
>>> corpus = MmCorpus(datapath("testcorpus.mm"))
|
|
>>> tmp_path = get_tmpfile("temp_corpus.mm")
|
|
>>>
|
|
>>> MmCorpus.serialize(tmp_path, corpus) # serialize corpus to disk in MmCorpus format
|
|
>>> # MmCorpus.save_corpus(tmp_path, corpus) # this variant also possible, but if serialize availbe - call it.
|
|
>>> loaded_corpus = MmCorpus(tmp_path) # load corpus through constructor
|
|
>>> for (doc_1, doc_2) in zip(corpus, loaded_corpus):
|
|
... assert doc_1 == doc_2 # check that corpuses exactly same
|
|
|
|
|
|
See Also
|
|
--------
|
|
:mod:`gensim.corpora`
|
|
Corpuses in different formats
|
|
|
|
"""
|
|
def __iter__(self):
|
|
"""Iterate all over corpus."""
|
|
raise NotImplementedError('cannot instantiate abstract base class')
|
|
|
|
def save(self, *args, **kwargs):
|
|
"""Saves corpus in-memory state.
|
|
|
|
Warnings
|
|
--------
|
|
This save only the "state" of a corpus class, not the corpus data!
|
|
|
|
For saving data use the `serialize` method of the output format you'd like to use
|
|
(e.g. :meth:`gensim.corpora.mmcorpus.MmCorpus.serialize`).
|
|
|
|
"""
|
|
import warnings
|
|
warnings.warn(
|
|
"corpus.save() stores only the (tiny) iteration object in memory; "
|
|
"to serialize the actual corpus content, use e.g. MmCorpus.serialize(corpus)"
|
|
)
|
|
super(CorpusABC, self).save(*args, **kwargs)
|
|
|
|
def __len__(self):
|
|
"""Get the corpus size = the total number of documents in it."""
|
|
raise NotImplementedError("must override __len__() before calling len(corpus)")
|
|
|
|
@staticmethod
|
|
def save_corpus(fname, corpus, id2word=None, metadata=False):
|
|
"""Save `corpus` to disk.
|
|
|
|
Some formats support saving the dictionary (`feature_id -> word` mapping),
|
|
which can be provided by the optional `id2word` parameter.
|
|
|
|
Notes
|
|
-----
|
|
Some corpora also support random access via document indexing, so that the documents on disk
|
|
can be accessed in O(1) time (see the :class:`gensim.corpora.indexedcorpus.IndexedCorpus` base class).
|
|
|
|
In this case, :meth:`~gensim.interfaces.CorpusABC.save_corpus` is automatically called internally by
|
|
:func:`serialize`, which does :meth:`~gensim.interfaces.CorpusABC.save_corpus` plus saves the index
|
|
at the same time.
|
|
|
|
Calling :func:`serialize() is preferred to calling :meth:`gensim.interfaces.CorpusABC.save_corpus`.
|
|
|
|
Parameters
|
|
----------
|
|
fname : str
|
|
Path to output file.
|
|
corpus : iterable of list of (int, number)
|
|
Corpus in BoW format.
|
|
id2word : :class:`~gensim.corpora.Dictionary`, optional
|
|
Dictionary of corpus.
|
|
metadata : bool, optional
|
|
Write additional metadata to a separate too?
|
|
|
|
"""
|
|
raise NotImplementedError('cannot instantiate abstract base class')
|
|
|
|
|
|
class TransformedCorpus(CorpusABC):
|
|
"""Interface for corpora that are the result of an online (streamed) transformation."""
|
|
def __init__(self, obj, corpus, chunksize=None, **kwargs):
|
|
"""
|
|
|
|
Parameters
|
|
----------
|
|
obj : object
|
|
A transformation :class:`~gensim.interfaces.TransformationABC` object that will be applied
|
|
to each document from `corpus` during iteration.
|
|
corpus : iterable of list of (int, number)
|
|
Corpus in bag-of-words format.
|
|
chunksize : int, optional
|
|
If provided, a slightly more effective processing will be performed by grouping documents from `corpus`.
|
|
|
|
"""
|
|
self.obj, self.corpus, self.chunksize = obj, corpus, chunksize
|
|
# add the new parameters like per_word_topics to base class object of LdaModel
|
|
for key, value in kwargs.items():
|
|
setattr(self.obj, key, value)
|
|
self.metadata = False
|
|
|
|
def __len__(self):
|
|
"""Get corpus size."""
|
|
return len(self.corpus)
|
|
|
|
def __iter__(self):
|
|
"""Iterate over the corpus, applying the selected transformation.
|
|
|
|
If `chunksize` was set in the constructor, works in "batch-manner" (more efficient).
|
|
|
|
Yields
|
|
------
|
|
list of (int, number)
|
|
Documents in the sparse Gensim bag-of-words format.
|
|
|
|
"""
|
|
if self.chunksize:
|
|
for chunk in utils.grouper(self.corpus, self.chunksize):
|
|
for transformed in self.obj.__getitem__(chunk, chunksize=None):
|
|
yield transformed
|
|
else:
|
|
for doc in self.corpus:
|
|
yield self.obj[doc]
|
|
|
|
def __getitem__(self, docno):
|
|
"""Transform the document at position `docno` within `corpus` specified in the constructor.
|
|
|
|
Parameters
|
|
----------
|
|
docno : int
|
|
Position of the document to transform. Document offset inside `self.corpus`.
|
|
|
|
Notes
|
|
-----
|
|
`self.corpus` must support random indexing.
|
|
|
|
Returns
|
|
-------
|
|
list of (int, number)
|
|
Transformed document in the sparse Gensim bag-of-words format.
|
|
|
|
Raises
|
|
------
|
|
RuntimeError
|
|
If corpus doesn't support index slicing (`__getitem__` doesn't exists).
|
|
|
|
"""
|
|
if hasattr(self.corpus, '__getitem__'):
|
|
return self.obj[self.corpus[docno]]
|
|
else:
|
|
raise RuntimeError('Type {} does not support slicing.'.format(type(self.corpus)))
|
|
|
|
|
|
class TransformationABC(utils.SaveLoad):
|
|
"""Transformation interface.
|
|
|
|
A 'transformation' is any object which accepts document in BoW format via the `__getitem__` (notation `[]`)
|
|
and returns another sparse document in its stead:
|
|
|
|
>>> from gensim.models import LsiModel
|
|
>>> from gensim.test.utils import common_dictionary, common_corpus
|
|
>>>
|
|
>>> model = LsiModel(common_corpus, id2word=common_dictionary)
|
|
>>> bow_vector = model[common_corpus[0]] # model applied through __getitem__ on one document from corpus.
|
|
>>> bow_corpus = model[common_corpus] # also, we can apply model on the full corpus
|
|
|
|
"""
|
|
def __getitem__(self, vec):
|
|
"""Transform a single document, or a whole corpus, from one vector space into another.
|
|
|
|
Parameters
|
|
----------
|
|
vec : {list of (int, number), iterable of list of (int, number)}
|
|
Document in bag-of-words, or streamed corpus.
|
|
|
|
"""
|
|
raise NotImplementedError('cannot instantiate abstract base class')
|
|
|
|
def _apply(self, corpus, chunksize=None, **kwargs):
|
|
"""Apply the transformation to a whole corpus and get the result as another corpus.
|
|
|
|
Parameters
|
|
----------
|
|
corpus : iterable of list of (int, number)
|
|
Corpus in sparse Gensim bag-of-words format.
|
|
chunksize : int, optional
|
|
If provided, a more effective processing will performed.
|
|
|
|
Returns
|
|
-------
|
|
:class:`~gensim.interfaces.TransformedCorpus`
|
|
Transformed corpus.
|
|
|
|
"""
|
|
return TransformedCorpus(self, corpus, chunksize, **kwargs)
|
|
|
|
|
|
class SimilarityABC(utils.SaveLoad):
|
|
"""Interface for similarity search over a corpus.
|
|
|
|
In all instances, there is a corpus against which we want to perform the similarity search.
|
|
For each similarity search, the input is a document or a corpus, and the output are the similarities
|
|
to individual corpus documents.
|
|
|
|
Examples
|
|
--------
|
|
>>> from gensim.similarities import MatrixSimilarity
|
|
>>> from gensim.test.utils import common_dictionary, common_corpus
|
|
>>>
|
|
>>> index = MatrixSimilarity(common_corpus)
|
|
>>> similarities = index.get_similarities(common_corpus[1]) # get similarities between query and corpus
|
|
|
|
Notes
|
|
-----
|
|
There is also a convenience wrapper, where iterating over `self` yields similarities of each document in the corpus
|
|
against the whole corpus (i.e. the query is each corpus document in turn).
|
|
|
|
See Also
|
|
--------
|
|
:mod:`gensim.similarities`
|
|
Different index implementations of this interface.
|
|
|
|
"""
|
|
def __init__(self, corpus):
|
|
"""
|
|
|
|
Parameters
|
|
----------
|
|
corpus : iterable of list of (int, number)
|
|
Corpus in sparse Gensim bag-of-words format.
|
|
|
|
"""
|
|
raise NotImplementedError("cannot instantiate Abstract Base Class")
|
|
|
|
def get_similarities(self, doc):
|
|
"""Get similarities of the given document or corpus against this index.
|
|
|
|
Parameters
|
|
----------
|
|
doc : {list of (int, number), iterable of list of (int, number)}
|
|
Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents.
|
|
|
|
"""
|
|
raise NotImplementedError("cannot instantiate Abstract Base Class")
|
|
|
|
def __getitem__(self, query):
|
|
"""Get similarities of the given document or corpus against this index.
|
|
|
|
Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally.
|
|
|
|
Notes
|
|
-----
|
|
Passing an entire corpus as `query` can be more efficient than passing its documents one after another,
|
|
because it will issue queries in batches internally.
|
|
|
|
Parameters
|
|
----------
|
|
query : {list of (int, number), iterable of list of (int, number)}
|
|
Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents.
|
|
|
|
Returns
|
|
-------
|
|
{`scipy.sparse.csr.csr_matrix`, list of (int, float)}
|
|
Similarities given document or corpus and objects corpus, depends on `query`.
|
|
|
|
"""
|
|
is_corpus, query = utils.is_corpus(query)
|
|
if self.normalize:
|
|
# self.normalize only works if the input is a plain gensim vector/corpus (as
|
|
# advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
|
|
# as well, but in that case assume tricks are happening and don't normalize
|
|
# anything (self.normalize has no effect).
|
|
if not matutils.ismatrix(query):
|
|
if is_corpus:
|
|
query = [matutils.unitvec(v) for v in query]
|
|
else:
|
|
query = matutils.unitvec(query)
|
|
result = self.get_similarities(query)
|
|
|
|
if self.num_best is None:
|
|
return result
|
|
|
|
# if maintain_sparsity is True, result is scipy sparse. Sort, clip the
|
|
# topn and return as a scipy sparse matrix.
|
|
if getattr(self, 'maintain_sparsity', False):
|
|
return matutils.scipy2scipy_clipped(result, self.num_best)
|
|
|
|
# if the input query was a corpus (=more documents), compute the top-n
|
|
# most similar for each document in turn
|
|
if matutils.ismatrix(result):
|
|
return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
|
|
else:
|
|
# otherwise, return top-n of the single input document
|
|
return matutils.full2sparse_clipped(result, self.num_best)
|
|
|
|
def __iter__(self):
|
|
"""Iterate over all documents, compute similarity of each document against all other documents in the index.
|
|
|
|
Yields
|
|
------
|
|
{`scipy.sparse.csr.csr_matrix`, list of (int, float)}
|
|
Similarity of the current document and all documents in the corpus.
|
|
|
|
"""
|
|
# turn off query normalization (vectors in the index are assumed to be already normalized)
|
|
norm = self.normalize
|
|
self.normalize = False
|
|
|
|
# Try to compute similarities in bigger chunks of documents (not
|
|
# one query = a single document after another). The point is, a
|
|
# bigger query of N documents is faster than N small queries of one
|
|
# document.
|
|
#
|
|
# After computing similarities of the bigger query in `self[chunk]`,
|
|
# yield the resulting similarities one after another, so that it looks
|
|
# exactly the same as if they had been computed with many small queries.
|
|
try:
|
|
chunking = self.chunksize > 1
|
|
except AttributeError:
|
|
# chunking not supported; fall back to the (slower) mode of 1 query=1 document
|
|
chunking = False
|
|
if chunking:
|
|
# assumes `self.corpus` holds the index as a 2-d numpy array.
|
|
# this is true for MatrixSimilarity and SparseMatrixSimilarity, but
|
|
# may not be true for other (future) classes..?
|
|
for chunk_start in xrange(0, self.index.shape[0], self.chunksize):
|
|
# scipy.sparse doesn't allow slicing beyond real size of the matrix
|
|
# (unlike numpy). so, clip the end of the chunk explicitly to make
|
|
# scipy.sparse happy
|
|
chunk_end = min(self.index.shape[0], chunk_start + self.chunksize)
|
|
chunk = self.index[chunk_start: chunk_end]
|
|
for sim in self[chunk]:
|
|
yield sim
|
|
else:
|
|
for doc in self.index:
|
|
yield self[doc]
|
|
|
|
# restore old normalization value
|
|
self.normalize = norm
|