#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Compute similarities across a collection of documents in the Vector Space Model.
|
|||
|
|
|||
|
The main class is :class:`~gensim.similarities.docsim.Similarity`, which builds an index for a given set of documents.
|
|||
|
|
|||
|
Once the index is built, you can perform efficient queries like "Tell me how similar is this query document to each
|
|||
|
document in the index?". The result is a vector of numbers as large as the size of the initial set of documents,
|
|||
|
that is, one float for each index document. Alternatively, you can also request only the top-N most
|
|||
|
similar index documents to the query.
|
|||
|
|
|||
|
|
|||
|
How It Works
|
|||
|
------------
|
|||
|
The :class:`~gensim.similarities.docsim.Similarity` class splits the index into several smaller sub-indexes ("shards"),
|
|||
|
which are disk-based. If your entire index fits in memory (~one million documents per 1GB of RAM),
|
|||
|
you can also use the :class:`~gensim.similarities.docsim.MatrixSimilarity`
|
|||
|
or :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly.
|
|||
|
These are more simple but do not scale as well: they keep the entire index in RAM, no sharding. They also do not
|
|||
|
support adding new document to the index dynamically.
|
|||
|
|
|||
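A minimal sketch of the in-memory variant (it shares the same query interface; `common_corpus` and
`common_dictionary` are gensim's bundled toy dataset):

>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.similarities import MatrixSimilarity
>>>
>>> index = MatrixSimilarity(common_corpus, num_features=len(common_dictionary))
>>> sims = index[[(1, 2), (6, 1)]]  # cosine similarity of the query against all indexed documents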

Once the index has been initialized, you can query for document similarity simply by

>>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
>>> from gensim.similarities import Similarity
>>>
>>> index_tmpfile = get_tmpfile("index")
>>> query = [(1, 2), (6, 1), (7, 2)]
>>>
>>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary))  # build the index
>>> similarities = index[query]  # get similarities between the query and all index documents

If you have more query documents, you can submit them all at once, in a batch:

>>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
>>> from gensim.similarities import Similarity
>>>
>>> index_tmpfile = get_tmpfile("index")
>>> batch_of_documents = common_corpus[:]  # only as example
>>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary))  # build the index
>>>
>>> for similarities in index[batch_of_documents]:  # the batch is simply an iterable of documents, aka a gensim corpus
...     pass

The benefit of this batch (aka "chunked") querying is much better performance.
To see the speed-up on your machine, run ``python -m gensim.test.simspeed``
(compare to my results `here <http://groups.google.com/group/gensim/msg/4f6f171a869e4fca?>`_).

There is also a special syntax for when you need similarity of documents in the index
to the index itself (i.e. queries = the indexed documents themselves). This special syntax
uses the faster, batch queries internally and **is ideal for all-vs-all pairwise similarities**:

>>> from gensim.test.utils import common_corpus, common_dictionary, get_tmpfile
|
|||
|
>>>
|
|||
|
>>> index_tmpfile = get_tmpfile("index")
|
|||
|
>>> index = Similarity(index_tmpfile, common_corpus, num_features=len(common_dictionary)) # build the index
|
|||
|
>>>
|
|||
|
>>> for similarities in index: # yield similarities of the 1st indexed document, then 2nd...
|
|||
|
... pass
|
|||
|
|
|||
|
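You can also ask for only the top-N most similar index documents to each query, and persist the index;
a minimal sketch (`num_best`, `save` and `load` are documented on the :class:`~gensim.similarities.docsim.Similarity` class below):

>>> index.num_best = 3
>>> top_matches = index[[(1, 2), (6, 1)]]  # up to 3 (document_index, similarity) pairs, most similar first
>>> index.save(index_tmpfile)  # spills any unfinished shard to disk and pickles the index
>>> index = Similarity.load(index_tmpfile)
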
"""

import logging
import itertools
import os
import heapq

import numpy
import scipy.sparse

from gensim import interfaces, utils, matutils
from six.moves import map as imap, xrange, zip as izip


logger = logging.getLogger(__name__)

PARALLEL_SHARDS = False
try:
    import multiprocessing
    # by default, don't parallelize queries. uncomment the following line if you want that.
    # PARALLEL_SHARDS = multiprocessing.cpu_count()  # use #parallel processes = #CPUs
except ImportError:
    pass


class Shard(utils.SaveLoad):
    """A proxy that represents a single shard instance within a :class:`~gensim.similarities.docsim.Similarity` index.

    Basically just wraps :class:`~gensim.similarities.docsim.MatrixSimilarity`,
    :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`, etc, so that it mmaps from disk on request (query).

    """
    def __init__(self, fname, index):
        """

        Parameters
        ----------
        fname : str
            Path to the file under which the shard index will be saved.
        index : :class:`~gensim.interfaces.SimilarityABC`
            Index object.

        """
        self.dirname, self.fname = os.path.split(fname)
        self.length = len(index)
        self.cls = index.__class__
        logger.info("saving index shard to %s", self.fullname())
        index.save(self.fullname())
        self.index = self.get_index()

    def fullname(self):
        """Get full path to shard file.

        Returns
        -------
        str
            Path to shard instance.

        """
        return os.path.join(self.dirname, self.fname)

    def __len__(self):
        """Get the number of documents in the shard."""
        return self.length

    def __getstate__(self):
        """Special handler for pickle.

        Returns
        -------
        dict
            Object that contains the state of the current instance, without `index`.

        """
        result = self.__dict__.copy()
        # (S)MS objects must be loaded via load() because of mmap (simple pickle.load won't do)
        if 'index' in result:
            del result['index']
        return result

    def __str__(self):
        return "%s Shard(%i documents in %s)" % (self.cls.__name__, len(self), self.fullname())

    def get_index(self):
        """Load & get index.

        Returns
        -------
        :class:`~gensim.interfaces.SimilarityABC`
            Index instance.

        """
        if not hasattr(self, 'index'):
            logger.debug("mmaping index from %s", self.fullname())
            self.index = self.cls.load(self.fullname(), mmap='r')
        return self.index

    def get_document_id(self, pos):
        """Get the indexed vector at position `pos`.

        Parameters
        ----------
        pos : int
            Vector position.

        Returns
        -------
        {:class:`scipy.sparse.csr_matrix`, :class:`numpy.ndarray`}
            Index vector. Type depends on underlying index.

        Notes
        -----
        The vector is of the same type as the underlying index (i.e., dense for
        :class:`~gensim.similarities.docsim.MatrixSimilarity`
        and scipy.sparse for :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`).

        """
        assert 0 <= pos < len(self), "requested position out of range"
        return self.get_index().index[pos]

    def __getitem__(self, query):
        """Get similarities of the document (or corpus) `query` to all documents in the corpus.

        Parameters
        ----------
        query : {iterable of list of (int, number), list of (int, number)}
            Document or corpus.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarities of the document/corpus if the index is :class:`~gensim.similarities.docsim.MatrixSimilarity`, **or**
        :class:`scipy.sparse.csr_matrix`
            if the index is :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`.

        """
        index = self.get_index()
        try:
            index.num_best = self.num_best
            index.normalize = self.normalize
        except Exception:
            raise ValueError("num_best and normalize have to be set before querying a proxy Shard object")
        return index[query]


def query_shard(args):
    """Helper to query a single shard; identical to shard[query].

    Parameters
    ----------
    args : (list of (int, number), :class:`~gensim.interfaces.SimilarityABC`)
        Query and Shard instances.

    Returns
    -------
    :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
        Similarities of the query against documents indexed in this shard.

    """
    query, shard = args  # simulate starmap (not part of multiprocessing in older Pythons)
    logger.debug("querying shard %s num_best=%s in process %s", shard, shard.num_best, os.getpid())
    result = shard[query]
    logger.debug("finished querying shard %s in process %s", shard, os.getpid())
    return result


class Similarity(interfaces.SimilarityABC):
    """Compute cosine similarity of a dynamic query against a corpus of documents ('the index').

    The index supports adding new documents dynamically.

    Notes
    -----
    Scalability is achieved by sharding the index into smaller pieces, each of which fits into core memory.
    The shards themselves are simply stored as files to disk and mmap'ed back as needed.

    Examples
    --------
    >>> from gensim.corpora.textcorpus import TextCorpus
    >>> from gensim.test.utils import datapath, get_tmpfile
    >>> from gensim.similarities import Similarity
    >>>
    >>> corpus = TextCorpus(datapath('testcorpus.mm'))
    >>> index_temp = get_tmpfile("index")
    >>> index = Similarity(index_temp, corpus, num_features=400)  # create index
    >>>
    >>> query = next(iter(corpus))
    >>> result = index[query]  # search similar to `query` in index
    >>>
    >>> for sims in index[corpus]:  # if you have more query documents, you can submit them all at once, in a batch
    ...     pass
    >>>
    >>> # There is also a special syntax for when you need similarity of documents in the index
    >>> # to the index itself (i.e. queries = the indexed documents themselves). This special syntax
    >>> # uses the faster, batch queries internally and **is ideal for all-vs-all pairwise similarities**:
    >>> for similarities in index:  # yield similarities of the 1st indexed document, then 2nd...
    ...     pass

    See Also
    --------
    :class:`~gensim.similarities.docsim.MatrixSimilarity`
        Index similarity (dense with cosine distance).
    :class:`~gensim.similarities.docsim.SparseMatrixSimilarity`
        Index similarity (sparse with cosine distance).
    :class:`~gensim.similarities.docsim.SoftCosineSimilarity`
        Index similarity (with soft-cosine distance).
    :class:`~gensim.similarities.docsim.WmdSimilarity`
        Index similarity (with word-mover distance).

    """

    def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=256, shardsize=32768, norm='l2'):
        """

        Parameters
        ----------
        output_prefix : str
            Prefix for shard filenames. If None, a random filename in temp will be used.
        corpus : iterable of list of (int, number)
            Corpus in streamed Gensim bag-of-words format.
        num_features : int
            Size of the dictionary (number of features).
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        shardsize : int, optional
            Maximum shard size, in documents. Choose a value so that a `shardsize x chunksize` matrix of floats fits
            comfortably into your RAM.
        norm : {'l1', 'l2'}, optional
            Normalization to use.

        Notes
        -----
        Documents are split (internally, transparently) into shards of `shardsize` documents each, and each shard
        is converted to a matrix, for faster BLAS calls. Each shard is stored to disk under `output_prefix.shard_number`.

        If you don't specify an output prefix, a random filename in temp will be used.

        If your entire index fits in memory (~1 million documents per 1GB of RAM), you can also use the
        :class:`~gensim.similarities.docsim.MatrixSimilarity` or
        :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly.
        These are simpler but do not scale as well (they keep the entire index in RAM, with no sharding).
        They also do not support adding new documents dynamically.
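
        For instance, the default `shardsize=32768` and `chunksize=256` mean each BLAS call works on
        a 32768 x 256 block of float32 similarities, i.e. 32768 * 256 * 4 bytes = 32 MiB per shard chunk.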

        """
        if output_prefix is None:
            # undocumented feature: set output_prefix=None to create the server in temp
            self.output_prefix = utils.randfname(prefix='simserver')
        else:
            self.output_prefix = output_prefix
        logger.info("starting similarity index under %s", self.output_prefix)
        self.num_features = num_features
        self.num_best = num_best
        self.norm = norm
        self.chunksize = int(chunksize)
        self.shardsize = shardsize
        self.shards = []
        self.fresh_docs, self.fresh_nnz = [], 0

        if corpus is not None:
            self.add_documents(corpus)

    def __len__(self):
        """Get the number of documents in the index."""
        return len(self.fresh_docs) + sum(len(shard) for shard in self.shards)

    def __str__(self):
        return "Similarity index with %i documents in %i shards (stored under %s)" % (
            len(self), len(self.shards), self.output_prefix
        )

    def add_documents(self, corpus):
        """Extend the index with new documents.

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in BoW format.

        Notes
        -----
        Internally, documents are buffered and then spilled to disk when there's `self.shardsize` of them
        (or when a query is issued).

        Examples
        --------
        >>> from gensim.corpora.textcorpus import TextCorpus
        >>> from gensim.test.utils import datapath, get_tmpfile
        >>> from gensim.similarities import Similarity
        >>>
        >>> corpus = TextCorpus(datapath('testcorpus.mm'))
        >>> index_temp = get_tmpfile("index")
        >>> index = Similarity(index_temp, corpus, num_features=400)  # create index
        >>>
        >>> one_more_corpus = TextCorpus(datapath('testcorpus.txt'))
        >>> index.add_documents(one_more_corpus)  # add more documents to the index

        """
        min_ratio = 1.0  # 0.5 to only reopen shards that are <50% complete
        if self.shards and len(self.shards[-1]) < min_ratio * self.shardsize:
            # The last shard was incomplete (< min_ratio * shardsize); load it back and add
            # the documents there, don't start a new shard
            self.reopen_shard()
        for doc in corpus:
            if isinstance(doc, numpy.ndarray):
                doclen = len(doc)
            elif scipy.sparse.issparse(doc):
                doclen = doc.nnz
            else:
                doclen = len(doc)
                if doclen < 0.3 * self.num_features:
                    doc = matutils.unitvec(matutils.corpus2csc([doc], self.num_features).T, self.norm)
                else:
                    doc = matutils.unitvec(matutils.sparse2full(doc, self.num_features), self.norm)
            self.fresh_docs.append(doc)
            self.fresh_nnz += doclen
            if len(self.fresh_docs) >= self.shardsize:
                self.close_shard()
            if len(self.fresh_docs) % 10000 == 0:
                logger.info("PROGRESS: fresh_shard size=%i", len(self.fresh_docs))

    def shardid2filename(self, shardid):
        """Get shard file by `shardid`.

        Parameters
        ----------
        shardid : int
            Shard index.

        Returns
        -------
        str
            Path to shard file.

        """
        if self.output_prefix.endswith('.'):
            return "%s%s" % (self.output_prefix, shardid)
        else:
            return "%s.%s" % (self.output_prefix, shardid)

    def close_shard(self):
        """Force the latest shard to close (be converted to a matrix and stored to disk).
        Do nothing if no new documents were added since the last call.

        Notes
        -----
        The shard is closed even if it is not full yet (its size is smaller than `self.shardsize`).
        If documents are added later via :meth:`~gensim.similarities.docsim.Similarity.add_documents`,
        this incomplete shard will be loaded again and completed.

        """
        if not self.fresh_docs:
            return
        shardid = len(self.shards)
        # consider the shard sparse if its density is < 30%
        issparse = 1.0 * self.fresh_nnz / (len(self.fresh_docs) * self.num_features) < 0.3
        if issparse:
            index = SparseMatrixSimilarity(
                self.fresh_docs, num_terms=self.num_features, num_docs=len(self.fresh_docs), num_nnz=self.fresh_nnz
            )
        else:
            index = MatrixSimilarity(self.fresh_docs, num_features=self.num_features)
        logger.info("creating %s shard #%s", 'sparse' if issparse else 'dense', shardid)
        shard = Shard(self.shardid2filename(shardid), index)
        shard.num_best = self.num_best
        shard.num_nnz = self.fresh_nnz
        self.shards.append(shard)
        self.fresh_docs, self.fresh_nnz = [], 0

    def reopen_shard(self):
        """Reopen an incomplete shard."""
        assert self.shards
        if self.fresh_docs:
            raise ValueError("cannot reopen a shard with fresh documents in index")
        last_shard = self.shards[-1]
        last_index = last_shard.get_index()
        logger.info("reopening an incomplete shard of %i documents", len(last_shard))

        self.fresh_docs = list(last_index.index)
        self.fresh_nnz = last_shard.num_nnz
        del self.shards[-1]  # remove the shard from index, *but its file on disk is not deleted*
        logger.debug("reopen complete")

    def query_shards(self, query):
        """Apply shard[query] to each shard in `self.shards`. Used internally.

        Parameters
        ----------
        query : {iterable of list of (int, number), list of (int, number)}
            Document in BoW format or corpus of documents.

        Returns
        -------
        (pool, iterable of individual shard query results)
            Query results; `pool` is the multiprocessing pool used, or None when querying serially.

        """
        args = list(zip([query] * len(self.shards), self.shards))
        if PARALLEL_SHARDS and PARALLEL_SHARDS > 1:
            logger.debug("spawning %i query processes", PARALLEL_SHARDS)
            pool = multiprocessing.Pool(PARALLEL_SHARDS)
            result = pool.imap(query_shard, args, chunksize=1 + len(args) // PARALLEL_SHARDS)
        else:
            # serial processing, one shard after another
            pool = None
            result = imap(query_shard, args)
        return pool, result

    def __getitem__(self, query):
        """Get similarities of the document (or corpus) `query` to all documents in the corpus.

        Parameters
        ----------
        query : {iterable of list of (int, number), list of (int, number)}
            A single document in bag-of-words format, or a corpus (iterable) of such documents.

        Returns
        -------
        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Similarities of the query against this index.

        Notes
        -----
        If `query` is a corpus (iterable of documents), return a matrix of similarities of
        all query documents vs. all corpus documents. This batch query is more efficient than computing the similarities
        one document after another.

        Examples
        --------
        >>> from gensim.corpora.textcorpus import TextCorpus
        >>> from gensim.test.utils import datapath
        >>> from gensim.similarities import Similarity
        >>>
        >>> corpus = TextCorpus(datapath('testcorpus.txt'))
        >>> index = Similarity('temp', corpus, num_features=400)
        >>> result = index[corpus]  # pairwise similarities of each document against each document
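
        If you only need the top-N most similar documents for each query, set `num_best` first;
        a minimal sketch, reusing the `index` built above:

        >>> index.num_best = 3
        >>> result = index[next(iter(corpus))]  # up to 3 (document_index, similarity) pairs, most similar first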

        """
        self.close_shard()  # no-op if no documents added to index since last query

        # reset num_best and normalize parameters, in case they were changed dynamically
        for shard in self.shards:
            shard.num_best = self.num_best
            shard.normalize = self.norm

        # there are 4 distinct code paths, depending on whether the input `query` is
        # a corpus (or numpy/scipy matrix) or a single document, and whether the
        # similarity result should be a full array or only the `num_best` most similar
        # documents.
        pool, shard_results = self.query_shards(query)
        if self.num_best is None:
            # user asked for all documents => just stack the sub-results into a single matrix
            # (works for both corpus / single doc query)
            result = numpy.hstack(shard_results)
        else:
            # the following uses a lot of lazy evaluation and (optionally) parallel
            # processing, to improve query latency and minimize memory footprint.
            offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])

            def convert(shard_no, doc):
                return [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]

            is_corpus, query = utils.is_corpus(query)
            is_corpus = is_corpus or (hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1)
            if not is_corpus:
                # user asked for num_best most similar and query is a single doc
                results = (convert(shard_no, result) for shard_no, result in enumerate(shard_results))
                result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])
            else:
                # the trickiest combination: returning num_best results when query was a corpus
                results = []
                for shard_no, result in enumerate(shard_results):
                    shard_result = [convert(shard_no, doc) for doc in result]
                    results.append(shard_result)
                result = []
                for parts in izip(*results):
                    merged = heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: item[1])
                    result.append(merged)
        if pool:
            # gc doesn't seem to collect the Pools, eventually leading to
            # "IOError 24: too many open files". so let's terminate it manually.
            pool.terminate()

        return result

    def vector_by_id(self, docpos):
        """Get the indexed vector corresponding to the document at position `docpos`.

        Parameters
        ----------
        docpos : int
            Document position.

        Returns
        -------
        :class:`scipy.sparse.csr_matrix`
            Indexed vector.

        Examples
        --------
        >>> from gensim.corpora.textcorpus import TextCorpus
        >>> from gensim.test.utils import datapath
        >>> from gensim.similarities import Similarity
        >>>
        >>> # Create index:
        >>> corpus = TextCorpus(datapath('testcorpus.txt'))
        >>> index = Similarity('temp', corpus, num_features=400)
        >>> vector = index.vector_by_id(1)

        """
        self.close_shard()  # no-op if no documents added to index since last query
        pos = 0
        for shard in self.shards:
            pos += len(shard)
            if docpos < pos:
                break
        if not self.shards or docpos < 0 or docpos >= pos:
            raise ValueError("invalid document position: %s (must be 0 <= x < %s)" % (docpos, len(self)))
        result = shard.get_document_id(docpos - pos + len(shard))
        return result

    def similarity_by_id(self, docpos):
        """Get similarity of a document specified by its index position `docpos`.

        Parameters
        ----------
        docpos : int
            Document position in the index.

        Returns
        -------
        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Similarities of the given document against this index.

        Examples
        --------
        >>> from gensim.corpora.textcorpus import TextCorpus
        >>> from gensim.test.utils import datapath
        >>> from gensim.similarities import Similarity
        >>>
        >>> corpus = TextCorpus(datapath('testcorpus.txt'))
        >>> index = Similarity('temp', corpus, num_features=400)
        >>> similarities = index.similarity_by_id(1)

        """
        query = self.vector_by_id(docpos)
        norm, self.norm = self.norm, False
        result = self[query]
        self.norm = norm
        return result

    def __iter__(self):
        """For each index document in index, compute cosine similarity against all other documents in the index.
        Uses :meth:`~gensim.similarities.docsim.Similarity.iter_chunks` internally.

        Yields
        ------
        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Similarities of each document in turn against the index.

        """
        # turn off query normalization (vectors in the index are already normalized, save some CPU)
        norm, self.norm = self.norm, False

        for chunk in self.iter_chunks():
            if chunk.shape[0] > 1:
                for sim in self[chunk]:
                    yield sim
            else:
                yield self[chunk]

        self.norm = norm  # restore normalization

    def iter_chunks(self, chunksize=None):
        """Iteratively yield the index as chunks of document vectors, each of size <= chunksize.

        Parameters
        ----------
        chunksize : int, optional
            Size of each chunk. If None, `self.chunksize` will be used.

        Yields
        ------
        :class:`numpy.ndarray` or :class:`scipy.sparse.csr_matrix`
            Chunks of the index as 2D arrays. The arrays are either dense or sparse, depending on
            whether the shard was storing dense or sparse vectors.
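
        Examples
        --------
        A minimal sketch, assuming `index` was built as in the class example above:

        >>> for chunk in index.iter_chunks(chunksize=100):  # doctest: +SKIP
        ...     pass  # each `chunk` holds up to 100 indexed document vectors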

        """
        self.close_shard()

        if chunksize is None:
            # if not explicitly specified, use the chunksize from the constructor
            chunksize = self.chunksize

        for shard in self.shards:
            query = shard.get_index().index
            for chunk_start in xrange(0, query.shape[0], chunksize):
                # scipy.sparse doesn't allow slicing beyond the real size of the matrix
                # (unlike numpy). so, clip the end of the chunk explicitly, to make
                # scipy.sparse happy
                chunk_end = min(query.shape[0], chunk_start + chunksize)
                chunk = query[chunk_start: chunk_end]  # create a view
                yield chunk

    def check_moved(self):
        """Update shard locations, for the case where the server prefix location changed on the filesystem.
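
        A minimal sketch (assuming the shard files were moved along with the index pickle; the path is hypothetical):

        >>> index.output_prefix = "/new/path/index"  # doctest: +SKIP
        >>> index.check_moved()  # doctest: +SKIP

        """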
        dirname = os.path.dirname(self.output_prefix)
        for shard in self.shards:
            shard.dirname = dirname

    def save(self, fname=None, *args, **kwargs):
        """Save the index object via pickling under `fname`. See also :meth:`~gensim.similarities.docsim.Similarity.load`.

        Parameters
        ----------
        fname : str, optional
            Path under which to save the index. If not provided, the index will be saved to `self.output_prefix`.
        *args : object
            Arguments, see :meth:`gensim.utils.SaveLoad.save`.
        **kwargs : object
            Keyword arguments, see :meth:`gensim.utils.SaveLoad.save`.

        Notes
        -----
        Will call :meth:`~gensim.similarities.docsim.Similarity.close_shard` internally to spill
        any unfinished shards to disk first.

        Examples
        --------
        >>> from gensim.corpora.textcorpus import TextCorpus
        >>> from gensim.test.utils import datapath, get_tmpfile
        >>> from gensim.similarities import Similarity
        >>>
        >>> temp_fname = get_tmpfile("index")
        >>> output_fname = get_tmpfile("saved_index")
        >>>
        >>> corpus = TextCorpus(datapath('testcorpus.txt'))
        >>> index = Similarity(temp_fname, corpus, num_features=400)
        >>>
        >>> index.save(output_fname)
        >>> loaded_index = index.load(output_fname)

        """
        self.close_shard()
        if fname is None:
            fname = self.output_prefix
        super(Similarity, self).save(fname, *args, **kwargs)

    def destroy(self):
        """Delete all files under `self.output_prefix`. The index is not usable anymore after calling this method."""
        import glob
        for fname in glob.glob(self.output_prefix + '*'):
            logger.info("deleting %s", fname)
            os.remove(fname)


class MatrixSimilarity(interfaces.SimilarityABC):
    """Compute cosine similarity against a corpus of documents by storing the index matrix in memory.

    Unless the entire matrix fits into main memory, use :class:`~gensim.similarities.docsim.Similarity` instead.

    Examples
    --------
    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.similarities import MatrixSimilarity
    >>>
    >>> query = [(1, 2), (5, 4)]
    >>> index = MatrixSimilarity(common_corpus, num_features=len(common_dictionary))
    >>> sims = index[query]

    """
    def __init__(self, corpus, num_best=None, dtype=numpy.float32, num_features=None, chunksize=256, corpus_len=None):
        """

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Corpus in streamed Gensim bag-of-words format.
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        dtype : numpy.dtype, optional
            Datatype to store the internal matrix in.
        num_features : int
            Size of the dictionary (number of features).
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        corpus_len : int, optional
            Number of documents in `corpus`. If not specified, will scan the corpus to determine the matrix size.

        """
        if num_features is None:
            logger.warning(
                "scanning corpus to determine the number of features (consider setting `num_features` explicitly)"
            )
            num_features = 1 + utils.get_max_id(corpus)

        self.num_features = num_features
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        if corpus_len is None:
            corpus_len = len(corpus)

        if corpus is not None:
            if self.num_features <= 0:
                raise ValueError(
                    "cannot index a corpus with zero features (you must specify either `num_features` "
                    "or a non-empty corpus in the constructor)"
                )
            logger.info("creating matrix with %i documents and %i features", corpus_len, num_features)
            self.index = numpy.empty(shape=(corpus_len, num_features), dtype=dtype)
            # iterate over corpus, populating the numpy index matrix with (normalized)
            # document vectors
            for docno, vector in enumerate(corpus):
                if docno % 1000 == 0:
                    logger.debug("PROGRESS: at document #%i/%i", docno, corpus_len)
                # individual documents may, in fact, come in as numpy arrays or scipy.sparse
                # matrices as well. this is not documented because it's not fully supported
                # throughout; the user had better know what they're doing (no normalization, must
                # explicitly supply num_features etc).
                if isinstance(vector, numpy.ndarray):
                    pass
                elif scipy.sparse.issparse(vector):
                    vector = vector.toarray().flatten()
                else:
                    vector = matutils.unitvec(matutils.sparse2full(vector, num_features))
                self.index[docno] = vector

    def __len__(self):
        return self.index.shape[0]

    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the :meth:`~gensim.similarities.docsim.MatrixSimilarity.__getitem__`
        (i.e. `self[query]`) syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarity matrix.

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = numpy.asarray(
                [matutils.sparse2full(vec, self.num_features) for vec in query],
                dtype=self.index.dtype
            )
        else:
            if scipy.sparse.issparse(query):
                query = query.toarray()  # convert sparse to dense
            elif isinstance(query, numpy.ndarray):
                pass
            else:
                # default case: query is a single vector in sparse gensim format
                query = matutils.sparse2full(query, self.num_features)
            query = numpy.asarray(query, dtype=self.index.dtype)

        # do a little transposition dance to stop numpy from making a copy of
        # self.index internally in numpy.dot (very slow).
        result = numpy.dot(self.index, query.T).T  # return #queries x #index
        return result  # XXX: removed casting the result from array to list; does anyone care?

    def __str__(self):
        return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.index.shape[1])


class SoftCosineSimilarity(interfaces.SimilarityABC):
    """Compute soft cosine similarity against a corpus of documents by storing the index matrix in memory.

    Examples
    --------
    >>> from gensim.test.utils import common_texts
    >>> from gensim.corpora import Dictionary
    >>> from gensim.models import Word2Vec
    >>> from gensim.similarities import SoftCosineSimilarity
    >>>
    >>> model = Word2Vec(common_texts, size=20, min_count=1)  # train word-vectors
    >>> dictionary = Dictionary(common_texts)
    >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
    >>>
    >>> similarity_matrix = model.wv.similarity_matrix(dictionary)  # construct similarity matrix
    >>> index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
    >>>
    >>> # Make a query.
    >>> query = 'graph trees computer'.split()
    >>> # calculate similarity between query and each doc from bow_corpus
    >>> sims = index[dictionary.doc2bow(query)]

    Check out the `tutorial notebook
    <https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb>`_
    for more examples.

    """
    def __init__(self, corpus, similarity_matrix, num_best=None, chunksize=256):
        """

        Parameters
        ----------
        corpus : iterable of list of (int, float)
            A list of documents in the BoW format.
        similarity_matrix : :class:`scipy.sparse.csc_matrix`
            A term similarity matrix, typically produced by
            :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`.
        num_best : int, optional
            The number of results to retrieve for a query. If None, return similarities with all elements from the corpus.
        chunksize : int, optional
            Size of one corpus chunk.

        See Also
        --------
        :meth:`gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`
            A term similarity matrix produced from term embeddings.
        :func:`gensim.matutils.softcossim`
            The Soft Cosine Measure.

        """
        self.corpus = corpus
        self.similarity_matrix = similarity_matrix
        self.num_best = num_best
        self.chunksize = chunksize

        # Normalization of features is undesirable, since soft cosine similarity requires special
        # normalization using the similarity matrix. Therefore, we would just be normalizing twice,
        # increasing the numerical error.
        self.normalize = False

        # index is simply an array from 0 to size of corpus.
        self.index = numpy.arange(len(corpus))

    def __len__(self):
        return len(self.corpus)

    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document or collection of documents.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarity matrix.

        """
        is_corpus, query = utils.is_corpus(query)
        if not is_corpus:
            if isinstance(query, numpy.ndarray):
                # Convert document indexes to actual documents.
                query = [self.corpus[i] for i in query]
            else:
                query = [query]

        result = []
        for query_document in query:
            # Compute similarity for each query.
            qresult = [matutils.softcossim(query_document, corpus_document, self.similarity_matrix)
                       for corpus_document in self.corpus]
            qresult = numpy.array(qresult)

            # Append single query result to list of all results.
            result.append(qresult)

        if is_corpus:
            result = numpy.array(result)
        else:
            result = result[0]

        return result

    def __str__(self):
        return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.similarity_matrix.shape[0])


class WmdSimilarity(interfaces.SimilarityABC):
    """Compute WMD similarity (derived from the negative Word Mover's Distance) against a corpus of documents,
    by storing the index matrix in memory.

    See :class:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors` for more information.
    Also, see the tutorial `notebook
    <https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/WMD_tutorial.ipynb>`_ for more examples.

    When using this code, please consider citing the following papers:

    * `Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching"
      <http://www.cs.huji.ac.il/~werman/Papers/ECCV2008.pdf>`_
    * `Ofir Pele and Michael Werman, "Fast and robust earth mover's distances"
      <http://www.cs.huji.ac.il/~werman/Papers/ICCV2009.pdf>`_
    * `Matt Kusner et al. "From Word Embeddings To Document Distances"
      <http://proceedings.mlr.press/v37/kusnerb15.pdf>`_

    Example
    -------
    >>> from gensim.test.utils import common_texts
    >>> from gensim.models import Word2Vec
    >>> from gensim.similarities import WmdSimilarity
    >>>
    >>> model = Word2Vec(common_texts, size=20, min_count=1)  # train word-vectors
    >>>
    >>> index = WmdSimilarity(common_texts, model)  # WMD operates on raw token lists, not BoW vectors
    >>> # Make query.
    >>> query = ['trees']
    >>> sims = index[query]

    """
    def __init__(self, corpus, w2v_model, num_best=None, normalize_w2v_and_replace=True, chunksize=256):
        """

        Parameters
        ----------
        corpus : iterable of list of str
            A list of documents, each given as a list of tokens (WMD operates on words, not BoW vectors).
        w2v_model : :class:`~gensim.models.word2vec.Word2Vec`
            A trained word2vec model.
        num_best : int, optional
            Number of results to retrieve.
        normalize_w2v_and_replace : bool, optional
            Whether or not to normalize the word2vec vectors to length 1.
        chunksize : int, optional
            Size of chunk.

        """
        self.corpus = corpus
        self.w2v_model = w2v_model
        self.num_best = num_best
        self.chunksize = chunksize

        # Normalization of features is not possible, as corpus is a list (of lists) of strings.
        self.normalize = False

        # index is simply an array from 0 to size of corpus.
        self.index = numpy.arange(len(corpus))

        if normalize_w2v_and_replace:
            # Normalize vectors in word2vec class to length 1.
            w2v_model.init_sims(replace=True)

    def __len__(self):
        """Get size of corpus."""
        return len(self.corpus)

    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of str, iterable of list of str}
            Document or collection of documents, each given as a list of tokens.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarity matrix.

        """
        if isinstance(query, numpy.ndarray):
            # Convert document indexes to actual documents.
            query = [self.corpus[i] for i in query]

        if not query or not isinstance(query[0], list):
            query = [query]

        n_queries = len(query)
        result = []
        for qidx in range(n_queries):
            # Compute similarity for each query.
            qresult = [self.w2v_model.wmdistance(document, query[qidx]) for document in self.corpus]
            qresult = numpy.array(qresult)
            qresult = 1. / (1. + qresult)  # Similarity decreases monotonically with the WMD distance.

            # Append single query result to list of all results.
            result.append(qresult)

        if len(result) == 1:
            # Only one query.
            result = result[0]
        else:
            result = numpy.array(result)

        return result

    def __str__(self):
        return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.w2v_model.wv.syn0.shape[1])


class SparseMatrixSimilarity(interfaces.SimilarityABC):
    """Compute cosine similarity against a corpus of documents by storing the index matrix in memory.

    Notes
    -----
    Use this if your input corpus contains sparse vectors (such as TF-IDF documents) and fits into RAM.

    The matrix is internally stored as a :class:`scipy.sparse.csr_matrix` matrix. Unless the entire
    matrix fits into main memory, use :class:`~gensim.similarities.docsim.Similarity` instead.

    Takes an optional `maintain_sparsity` argument: setting it to True
    causes `get_similarities` to return a sparse matrix instead of a
    dense representation, if possible.
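
    Examples
    --------
    A minimal sketch using gensim's bundled toy corpus:

    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.similarities import SparseMatrixSimilarity
    >>>
    >>> query = [(1, 2), (5, 4)]
    >>> index = SparseMatrixSimilarity(common_corpus, num_features=len(common_dictionary))
    >>> sims = index[query]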

    See Also
    --------
    :class:`~gensim.similarities.docsim.Similarity`
        Index similarity (wrapper for other inheritors of :class:`~gensim.interfaces.SimilarityABC`).
    :class:`~gensim.similarities.docsim.MatrixSimilarity`
        Index similarity (dense with cosine distance).

    """
    def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
                 num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
        """

        Parameters
        ----------
        corpus : iterable of list of (int, float)
            A list of documents in the BoW format.
        num_features : int, optional
            Size of the dictionary. Must be either specified, or present in `corpus.num_terms`.
        num_terms : int, optional
            Alias for `num_features`, you can use either.
        num_docs : int, optional
            Number of documents in `corpus`. Will be calculated if not provided.
        num_nnz : int, optional
            Number of non-zero elements in `corpus`. Will be calculated if not provided.
        num_best : int, optional
            If set, return only the `num_best` most similar documents, always leaving out documents with similarity = 0.
            Otherwise, return a full vector with one float for every document in the index.
        chunksize : int, optional
            Size of query chunks. Used internally when the query is an entire corpus.
        dtype : numpy.dtype, optional
            Data type of the internal matrix.
        maintain_sparsity : bool, optional
            Return sparse arrays from :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`?

        """
        self.num_best = num_best
        self.normalize = True
        self.chunksize = chunksize
        self.maintain_sparsity = maintain_sparsity

        if corpus is not None:
            logger.info("creating sparse index")

            # iterate over input corpus, populating the sparse index matrix
            try:
                # use the more efficient corpus generation version, if the input
                # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
                num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
                logger.debug("using efficient sparse index creation")
            except AttributeError:
                # not an MmCorpus, use the slower version (or maybe the user supplied the
                # num_* params in the constructor)
                pass
            if num_features is not None:
                # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
                num_terms = num_features
            if num_terms is None:
                raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
            corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                      (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                       matutils.unitvec(v)) for v in corpus)
            self.index = matutils.corpus2csc(
                corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
                dtype=dtype, printprogress=10000
            ).T

            # convert to Compressed Sparse Row for efficient row slicing and multiplications
            self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
            logger.info("created %r", self.index)

    def __len__(self):
        """Get size of index."""
        return self.index.shape[0]

    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarity matrix (if `maintain_sparsity=False`) **OR**
        :class:`scipy.sparse.csc_matrix`
            otherwise.

        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.T  # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    query.shape = (1, len(query))
                query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
            else:
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc()  # N x T * T x C = N x C
        if result.shape[1] == 1 and not is_corpus:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
        elif self.maintain_sparsity:
            # avoid converting to dense array if maintaining sparsity
            result = result.T
        else:
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result