laywerrobot/lib/python3.6/site-packages/gensim/models/ldamodel.py
2020-08-27 21:55:39 +02:00

1646 lines
72 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Optimized `Latent Dirichlet Allocation (LDA) <https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation>` in Python.
For a faster implementation of LDA (parallelized for multicore machines), see also :mod:`gensim.models.ldamulticore`.
This module allows both LDA model estimation from a training corpus and inference of topic
distribution on new, unseen documents. The model can also be updated with new documents
for online training.
The core estimation code is based on the `onlineldavb.py script
<https://github.com/blei-lab/onlineldavb/blob/master/onlineldavb.py>`_, by `Hoffman, Blei, Bach:
Online Learning for Latent Dirichlet Allocation, NIPS 2010 <http://www.cs.princeton.edu/~mdhoffma>`_.
The algorithm:
#. Is **streamed**: training documents may come in sequentially, no random access required.
#. Runs in **constant memory** w.r.t. the number of documents: size of the training corpus does not affect memory
footprint, can process corpora larger than RAM.
#. Is **distributed**: makes use of a cluster of machines, if available, to speed up model estimation.
Usage examples
--------------
Train an LDA model using a Gensim corpus
>>> from gensim.test.utils import common_texts
>>> from gensim.corpora.dictionary import Dictionary
>>>
>>> # Create a corpus from a list of texts
>>> common_dictionary = Dictionary(common_texts)
>>> common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
>>>
>>> # Train the model on the corpus.
>>> lda = LdaModel(common_corpus, num_topics=10)
Save a model to disk, or reload a pre-trained model
>>> from gensim.test.utils import datapath
>>>
>>> # Save model to disk.
>>> temp_file = datapath("model")
>>> lda.save(temp_file)
>>>
>>> # Load a potentially pretrained model from disk.
>>> lda = LdaModel.load(temp_file)
Query, the model using new, unseen documents
>>> # Create a new corpus, made of previously unseen documents.
>>> other_texts = [
... ['computer', 'time', 'graph'],
... ['survey', 'response', 'eps'],
... ['human', 'system', 'computer']
... ]
>>> other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
>>>
>>> unseen_doc = other_corpus[0]
>>> vector = lda[unseen_doc] # get topic probability distribution for a document
Update the model by incrementally training on the new corpus
>>> lda.update(other_corpus)
>>> vector = lda[unseen_doc]
A lot of parameters can be tuned to optimize training for your specific case
>>> lda = LdaModel(common_corpus, num_topics=50, alpha='auto', eval_every=5) # learn asymmetric alpha from data
"""
import logging
import numbers
import os
import numpy as np
import six
from scipy.special import gammaln, psi # gamma function utils
from scipy.special import polygamma
from six.moves import xrange
from collections import defaultdict
from gensim import interfaces, utils, matutils
from gensim.matutils import (
kullback_leibler, hellinger, jaccard_distance, jensen_shannon,
dirichlet_expectation, logsumexp, mean_absolute_difference
)
from gensim.models import basemodel, CoherenceModel
from gensim.models.callbacks import Callback
logger = logging.getLogger(__name__)
# Epsilon (very small) values used by each expected data type instead of 0, to avoid Arithmetic Errors.
DTYPE_TO_EPS = {
np.float16: 1e-5,
np.float32: 1e-35,
np.float64: 1e-100,
}
def update_dir_prior(prior, N, logphat, rho):
"""Update a given prior using Newton's method, described in
`J. Huang: "Maximum Likelihood Estimation of Dirichlet Distribution Parameters"
<http://jonathan-huang.org/research/dirichlet/dirichlet.pdf>`_.
Parameters
----------
prior : list of float
The prior for each possible outcome at the previous iteration (to be updated).
N : int
Number of observations.
logphat : list of float
Log probabilities for the current estimation, also called "observed sufficient statistics".
rho : float
Learning rate.
Returns
-------
list of float
The updated prior.
"""
gradf = N * (psi(np.sum(prior)) - psi(prior) + logphat)
c = N * polygamma(1, np.sum(prior))
q = -N * polygamma(1, prior)
b = np.sum(gradf / q) / (1 / c + np.sum(1 / q))
dprior = -(gradf - b) / q
if all(rho * dprior + prior > 0):
prior += rho * dprior
else:
logger.warning("updated prior not positive")
return prior
class LdaState(utils.SaveLoad):
"""Encapsulate information for distributed computation of :class:`~gensim.models.ldamodel.LdaModel` objects.
Objects of this class are sent over the network, so try to keep them lean to
reduce traffic.
"""
def __init__(self, eta, shape, dtype=np.float32):
"""
Parameters
----------
eta : numpy.ndarray
The prior probabilities assigned to each term.
shape : tuple of (int, int)
Shape of the sufficient statistics: (number of topics to be found, number of terms in the vocabulary).
dtype : type
Overrides the numpy array default types.
"""
self.eta = eta.astype(dtype, copy=False)
self.sstats = np.zeros(shape, dtype=dtype)
self.numdocs = 0
self.dtype = dtype
def reset(self):
"""Prepare the state for a new EM iteration (reset sufficient stats)."""
self.sstats[:] = 0.0
self.numdocs = 0
def merge(self, other):
"""Merge the result of an E step from one node with that of another node (summing up sufficient statistics).
The merging is trivial and after merging all cluster nodes, we have the
exact same result as if the computation was run on a single node (no
approximation).
Parameters
----------
other : :class:`~gensim.models.ldamodel.LdaState`
The state object with which the current one will be merged.
"""
assert other is not None
self.sstats += other.sstats
self.numdocs += other.numdocs
def blend(self, rhot, other, targetsize=None):
"""Merge the current state with another one using a weighted average for the sufficient statistics.
The number of documents is stretched in both state objects, so that they are of comparable magnitude.
This procedure corresponds to the stochastic gradient update from
`Hoffman et al. :"Online Learning for Latent Dirichlet Allocation"
<https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_, see equations (5) and (9).
Parameters
----------
rhot : float
Weight of the `other` state in the computed average. A value of 0.0 means that `other`
is completely ignored. A value of 1.0 means `self` is completely ignored.
other : :class:`~gensim.models.ldamodel.LdaState`
The state object with which the current one will be merged.
targetsize : int, optional
The number of documents to stretch both states to.
"""
assert other is not None
if targetsize is None:
targetsize = self.numdocs
# stretch the current model's expected n*phi counts to target size
if self.numdocs == 0 or targetsize == self.numdocs:
scale = 1.0
else:
scale = 1.0 * targetsize / self.numdocs
self.sstats *= (1.0 - rhot) * scale
# stretch the incoming n*phi counts to target size
if other.numdocs == 0 or targetsize == other.numdocs:
scale = 1.0
else:
logger.info("merging changes from %i documents into a model of %i documents", other.numdocs, targetsize)
scale = 1.0 * targetsize / other.numdocs
self.sstats += rhot * scale * other.sstats
self.numdocs = targetsize
def blend2(self, rhot, other, targetsize=None):
"""Merge the current state with another one using a weighted sum for the sufficient statistics.
In contrast to :meth:`~gensim.models.ldamodel.LdaState.blend`, the sufficient statistics are not scaled
prior to aggregation.
Parameters
----------
rhot : float
Unused.
other : :class:`~gensim.models.ldamodel.LdaState`
The state object with which the current one will be merged.
targetsize : int, optional
The number of documents to stretch both states to.
"""
assert other is not None
if targetsize is None:
targetsize = self.numdocs
# merge the two matrices by summing
self.sstats += other.sstats
self.numdocs = targetsize
def get_lambda(self):
"""Get the parameters of the posterior over the topics, also referred to as "the topics".
Returns
-------
numpy.ndarray
Parameters of the posterior probability over topics.
"""
return self.eta + self.sstats
def get_Elogbeta(self):
"""Get the log (posterior) probabilities for each topic.
Returns
-------
numpy.ndarray
Posterior probabilities for each topic.
"""
return dirichlet_expectation(self.get_lambda())
@classmethod
def load(cls, fname, *args, **kwargs):
"""Load a previously stored state from disk.
Overrides :class:`~gensim.utils.SaveLoad.load` by enforcing the `dtype` parameter
to ensure backwards compatibility.
Parameters
----------
fname : str
Path to file that contains the needed object.
args : object
Positional parameters to be propagated to class:`~gensim.utils.SaveLoad.load`
kwargs : object
Key-word parameters to be propagated to class:`~gensim.utils.SaveLoad.load`
Returns
-------
:class:`~gensim.models.ldamodel.LdaState`
The state loaded from the given file.
"""
result = super(LdaState, cls).load(fname, *args, **kwargs)
# dtype could be absent in old models
if not hasattr(result, 'dtype'):
result.dtype = np.float64 # float64 was implicitly used before (because it's the default in numpy)
logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)
return result
class LdaModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
"""Train and use Online Latent Dirichlet Allocation (OLDA) models as presented in
`Hoffman et al. :"Online Learning for Latent Dirichlet Allocation" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
Examples
-------
Initialize a model using a Gensim corpus
>>> from gensim.test.utils import common_corpus
>>>
>>> lda = LdaModel(common_corpus, num_topics=10)
You can then infer topic distributions on new, unseen documents.
>>> doc_bow = [(1, 0.3), (2, 0.1), (0, 0.09)]
>>> doc_lda = lda[doc_bow]
The model can be updated (trained) with new documents.
>>> # In practice (corpus =/= initial training corpus), but we use the same here for simplicity.
>>> other_corpus = common_corpus
>>>
>>> lda.update(other_corpus)
Model persistency is achieved through :meth:`~gensim.models.ldamodel.LdaModel.load` and
:meth:`~gensim.models.ldamodel.LdaModel.save` methods.
"""
def __init__(self, corpus=None, num_topics=100, id2word=None,
distributed=False, chunksize=2000, passes=1, update_every=1,
alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
random_state=None, ns_conf=None, minimum_phi_value=0.01,
per_word_topics=False, callbacks=None, dtype=np.float32):
"""
Parameters
----------
corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).
If not given, the model is left untrained (presumably because you want to call
:meth:`~gensim.models.ldamodel.LdaModel.update` manually).
num_topics : int, optional
The number of requested latent topics to be extracted from the training corpus.
id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}
Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for
debugging and topic printing.
distributed : bool, optional
Whether distributed computing should be used to accerelate training.
chunksize : int, optional
Number of documents to be used in each training chunk.
passes : int, optional
Number of passes through the corpus during training.
update_every : int, optional
Number of documents to be iterated through for each update.
Set to 0 for batch learning, > 1 for online iterative learning.
alpha : {numpy.ndarray, str}, optional
Can be set to an 1D array of length equal to the number of expected topics that expresses
our a-priori belief for the each topics' probability.
Alternatively default prior selecting strategies can be employed by supplying a string:
* 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / topicno`.
* 'default': Learns an asymmetric prior from the corpus.
eta : {float, np.array, str}, optional
A-priori belief on word probability, this can be:
* scalar for a symmetric prior over topic/word probability,
* vector of length num_words to denote an asymmetric user defined probability for each word,
* matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
* the string 'auto' to learn the asymmetric prior from the data.
decay : float, optional
A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
when each new document is examined. Corresponds to Kappa from
`Matthew D. Hoffman, David M. Blei, Francis Bach:
"Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
offset : float, optional
Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach:
"Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
eval_every : int, optional
Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x.
iterations : int, optional
Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.
gamma_threshold : float, optional
Minimum change in the value of the gamma parameters to continue iterating.
minimum_probability : float, optional
Topics with a probability lower than this threshold will be filtered out.
random_state : {np.random.RandomState, int}, optional
Either a randomState object or a seed to generate one. Useful for reproducibility.
ns_conf : dict of (str, object), optional
Key word parameters propagated to :func:`gensim.utils.getNS` to get a Pyro4 Nameserved.
Only used if `distributed` is set to True.
minimum_phi_value : float, optional
if `per_word_topics` is True, this represents a lower bound on the term probabilities.
per_word_topics : bool
If True, the model also computes a list of topics, sorted in descending order of most likely topics for
each word, along with their phi values multiplied by the feature length (i.e. word count).
callbacks : list of :class:`~gensim.models.callbacks.Callback`
Metric callbacks to log and visualize evaluation metrics of the model during training.
dtype : {numpy.float16, numpy.float32, numpy.float64}, optional
Data-type to use during calculations inside model. All inputs are also converted.
"""
if dtype not in DTYPE_TO_EPS:
raise ValueError(
"Incorrect 'dtype', please choose one of {}".format(
", ".join("numpy.{}".format(tp.__name__) for tp in sorted(DTYPE_TO_EPS))))
self.dtype = dtype
# store user-supplied parameters
self.id2word = id2word
if corpus is None and self.id2word is None:
raise ValueError(
'at least one of corpus/id2word must be specified, to establish input space dimensionality'
)
if self.id2word is None:
logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
self.id2word = utils.dict_from_corpus(corpus)
self.num_terms = len(self.id2word)
elif len(self.id2word) > 0:
self.num_terms = 1 + max(self.id2word.keys())
else:
self.num_terms = 0
if self.num_terms == 0:
raise ValueError("cannot compute LDA over an empty collection (no terms)")
self.distributed = bool(distributed)
self.num_topics = int(num_topics)
self.chunksize = chunksize
self.decay = decay
self.offset = offset
self.minimum_probability = minimum_probability
self.num_updates = 0
self.passes = passes
self.update_every = update_every
self.eval_every = eval_every
self.minimum_phi_value = minimum_phi_value
self.per_word_topics = per_word_topics
self.callbacks = callbacks
self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
assert self.alpha.shape == (self.num_topics,), \
"Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
if isinstance(eta, six.string_types):
if eta == 'asymmetric':
raise ValueError("The 'asymmetric' option cannot be used for eta")
self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')
self.random_state = utils.get_random_state(random_state)
assert self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms), (
"Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
(str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms))
# VB constants
self.iterations = iterations
self.gamma_threshold = gamma_threshold
# set up distributed environment if necessary
if not distributed:
logger.info("using serial LDA version on this node")
self.dispatcher = None
self.numworkers = 1
else:
if self.optimize_alpha:
raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
# set up distributed version
try:
import Pyro4
if ns_conf is None:
ns_conf = {}
with utils.getNS(**ns_conf) as ns:
from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX
self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
logger.debug("looking for dispatcher at %s" % str(self.dispatcher._pyroUri))
self.dispatcher.initialize(
id2word=self.id2word, num_topics=self.num_topics, chunksize=chunksize,
alpha=alpha, eta=eta, distributed=False
)
self.numworkers = len(self.dispatcher.getworkers())
logger.info("using distributed version with %i workers", self.numworkers)
except Exception as err:
logger.error("failed to initialize distributed LDA (%s)", err)
raise RuntimeError("failed to initialize distributed LDA (%s)" % err)
# Initialize the variational distribution q(beta|lambda)
self.state = LdaState(self.eta, (self.num_topics, self.num_terms), dtype=self.dtype)
self.state.sstats[...] = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))
# Check that we haven't accidentally fallen back to np.float64
assert self.eta.dtype == self.dtype
assert self.expElogbeta.dtype == self.dtype
# if a training corpus was provided, start estimating the model right away
if corpus is not None:
use_numpy = self.dispatcher is not None
self.update(corpus, chunks_as_numpy=use_numpy)
def init_dir_prior(self, prior, name):
"""Initialize priors for the Dirichlet distribution.
Parameters
----------
prior : {str, list of float, numpy.ndarray of float, float}
A-priori belief on word probability. If `name` == 'eta' then the prior can be:
* scalar for a symmetric prior over topic/word probability,
* vector of length num_words to denote an asymmetric user defined probability for each word,
* matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
* the string 'auto' to learn the asymmetric prior from the data.
If `name` == 'alpha', then the prior can be:
* an 1D array of length equal to the number of expected topics,
* 'asymmetric': Uses a fixed normalized assymetric prior of `1.0 / topicno`.
* 'default': Learns an assymetric prior from the corpus.
name : {'alpha', 'eta'}
Whether the `prior` is parameterized by the alpha vector (1 parameter per topic)
or by the eta (1 parameter per unique term in the vocabulary).
"""
if prior is None:
prior = 'symmetric'
if name == 'alpha':
prior_shape = self.num_topics
elif name == 'eta':
prior_shape = self.num_terms
else:
raise ValueError("'name' must be 'alpha' or 'eta'")
is_auto = False
if isinstance(prior, six.string_types):
if prior == 'symmetric':
logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
elif prior == 'asymmetric':
init_prior = \
np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)], dtype=self.dtype)
init_prior /= init_prior.sum()
logger.info("using asymmetric %s %s", name, list(init_prior))
elif prior == 'auto':
is_auto = True
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
if name == 'alpha':
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
else:
raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior))
elif isinstance(prior, list):
init_prior = np.asarray(prior, dtype=self.dtype)
elif isinstance(prior, np.ndarray):
init_prior = prior.astype(self.dtype, copy=False)
elif isinstance(prior, np.number) or isinstance(prior, numbers.Real):
init_prior = np.asarray([prior] * prior_shape, dtype=self.dtype)
else:
raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name)
return init_prior, is_auto
def __str__(self):
"""Get a string representation of the current object.
Returns
-------
str
Human readable representation of the most important model parameters.
"""
return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % (
self.num_terms, self.num_topics, self.decay, self.chunksize
)
def sync_state(self):
"""Propagate the states topic probabilities to the inner object's attribute."""
self.expElogbeta = np.exp(self.state.get_Elogbeta())
assert self.expElogbeta.dtype == self.dtype
def clear(self):
"""Clear the model's state to free some memory. Used in the distributed implementation."""
self.state = None
self.Elogbeta = None
def inference(self, chunk, collect_sstats=False):
"""Given a chunk of sparse document vectors, estimate gamma (parameters controlling the topic weights)
for each document in the chunk.
This function does not modify the model The whole input chunk of document is assumed to fit in RAM;
chunking of a large corpus must be done earlier in the pipeline. Avoids computing the `phi` variational
parameter directly using the optimization presented in
`Lee, Seung: Algorithms for non-negative matrix factorization"
<https://papers.nips.cc/paper/1861-algorithms-for-non-negative-matrix-factorization.pdf>`_.
Parameters
----------
chunk : {list of list of (int, float), scipy.sparse.csc}
The corpus chunk on which the inference step will be performed.
collect_sstats : bool, optional
If set to True, also collect (and return) sufficient statistics needed to update the model's topic-word
distributions.
Returns
-------
(numpy.ndarray, {numpy.ndarray, None})
The first element is always returned and it corresponds to the states gamma matrix. The second element is
only returned if `collect_sstats` == True and corresponds to the sufficient statistics for the M step.
"""
try:
len(chunk)
except TypeError:
# convert iterators/generators to plain list, so we have len() etc.
chunk = list(chunk)
if len(chunk) > 1:
logger.debug("performing inference on a chunk of %i documents", len(chunk))
# Initialize the variational distribution q(theta|gamma) for the chunk
gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics)).astype(self.dtype, copy=False)
Elogtheta = dirichlet_expectation(gamma)
expElogtheta = np.exp(Elogtheta)
assert Elogtheta.dtype == self.dtype
assert expElogtheta.dtype == self.dtype
if collect_sstats:
sstats = np.zeros_like(self.expElogbeta, dtype=self.dtype)
else:
sstats = None
converged = 0
# Now, for each document d update that document's gamma and phi
# Inference code copied from Hoffman's `onlineldavb.py` (esp. the
# Lee&Seung trick which speeds things up by an order of magnitude, compared
# to Blei's original LDA-C code, cool!).
for d, doc in enumerate(chunk):
if len(doc) > 0 and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
# make sure the term IDs are ints, otherwise np will get upset
ids = [int(idx) for idx, _ in doc]
else:
ids = [idx for idx, _ in doc]
cts = np.array([cnt for _, cnt in doc], dtype=self.dtype)
gammad = gamma[d, :]
Elogthetad = Elogtheta[d, :]
expElogthetad = expElogtheta[d, :]
expElogbetad = self.expElogbeta[:, ids]
# The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w.
# phinorm is the normalizer.
# TODO treat zeros explicitly, instead of adding epsilon?
eps = DTYPE_TO_EPS[self.dtype]
phinorm = np.dot(expElogthetad, expElogbetad) + eps
# Iterate between gamma and phi until convergence
for _ in xrange(self.iterations):
lastgamma = gammad
# We represent phi implicitly to save memory and time.
# Substituting the value of the optimal phi back into
# the update for gamma gives this update. Cf. Lee&Seung 2001.
gammad = self.alpha + expElogthetad * np.dot(cts / phinorm, expElogbetad.T)
Elogthetad = dirichlet_expectation(gammad)
expElogthetad = np.exp(Elogthetad)
phinorm = np.dot(expElogthetad, expElogbetad) + eps
# If gamma hasn't changed much, we're done.
meanchange = mean_absolute_difference(gammad, lastgamma)
if meanchange < self.gamma_threshold:
converged += 1
break
gamma[d, :] = gammad
assert gammad.dtype == self.dtype
if collect_sstats:
# Contribution of document d to the expected sufficient
# statistics for the M step.
sstats[:, ids] += np.outer(expElogthetad.T, cts / phinorm)
if len(chunk) > 1:
logger.debug("%i/%i documents converged within %i iterations", converged, len(chunk), self.iterations)
if collect_sstats:
# This step finishes computing the sufficient statistics for the
# M step, so that
# sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
# = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
sstats *= self.expElogbeta
assert sstats.dtype == self.dtype
assert gamma.dtype == self.dtype
return gamma, sstats
def do_estep(self, chunk, state=None):
"""Perform inference on a chunk of documents, and accumulate the collected sufficient statistics.
Parameters
----------
chunk : {list of list of (int, float), scipy.sparse.csc}
The corpus chunk on which the inference step will be performed.
state : :class:`~gensim.models.ldamodel.LdaState`, optional
The state to be updated with the newly accumulated sufficient statistics. If none, the models
`self.state` is updated.
Returns
-------
numpy.ndarray
Gamma parameters controlling the topic weights, shape (`len(chunk)`, `self.num_topics`).
"""
if state is None:
state = self.state
gamma, sstats = self.inference(chunk, collect_sstats=True)
state.sstats += sstats
state.numdocs += gamma.shape[0] # avoids calling len(chunk) on a generator
assert gamma.dtype == self.dtype
return gamma
def update_alpha(self, gammat, rho):
"""Update parameters for the Dirichlet prior on the per-document topic weights.
Parameters
----------
gammat : numpy.ndarray
Previous topic weight parameters.
rho : float
Learning rate.
Returns
-------
numpy.ndarray
Sequence of alpha parameters.
"""
N = float(len(gammat))
logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N
assert logphat.dtype == self.dtype
self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
logger.info("optimized alpha %s", list(self.alpha))
assert self.alpha.dtype == self.dtype
return self.alpha
def update_eta(self, lambdat, rho):
"""Update parameters for the Dirichlet prior on the per-topic word weights.
Parameters
----------
lambdat : numpy.ndarray
Previous lambda parameters.
rho : float
Learning rate.
Returns
-------
numpy.ndarray
The updated eta parameters.
"""
N = float(lambdat.shape[0])
logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))
assert logphat.dtype == self.dtype
self.eta = update_dir_prior(self.eta, N, logphat, rho)
assert self.eta.dtype == self.dtype
return self.eta
def log_perplexity(self, chunk, total_docs=None):
"""Calculate and return per-word likelihood bound, using a chunk of documents as evaluation corpus.
Also output the calculated statistics, including the perplexity=2^(-bound), to log at INFO level.
Parameters
----------
chunk : {list of list of (int, float), scipy.sparse.csc}
The corpus chunk on which the inference step will be performed.
total_docs : int, optional
Number of docs used for evaluation of the perplexity.
Returns
-------
numpy.ndarray
The variational bound score calculated for each word.
"""
if total_docs is None:
total_docs = len(chunk)
corpus_words = sum(cnt for document in chunk for _, cnt in document)
subsample_ratio = 1.0 * total_docs / len(chunk)
perwordbound = self.bound(chunk, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)
logger.info(
"%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words",
perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words
)
return perwordbound
def update(self, corpus, chunksize=None, decay=None, offset=None,
passes=None, update_every=None, eval_every=None, iterations=None,
gamma_threshold=None, chunks_as_numpy=False):
"""Train the model with new documents, by EM-iterating over the corpus until the topics converge, or until
the maximum number of allowed iterations is reached. `corpus` must be an iterable.
In distributed mode, the E step is distributed over a cluster of machines.
Notes
-----
This update also supports updating an already trained model with new documents; the two models are then merged
in proportion to the number of old vs. new documents. This feature is still experimental for non-stationary
input streams. For stationary input (no topic drift in new documents), on the other hand, this equals the
online update of `Matthew D. Hoffman, David M. Blei, Francis Bach:
"Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
and is guaranteed to converge for any `decay` in (0.5, 1.0). Additionally, for smaller corpus sizes, an
increasing `offset` may be beneficial (see Table 1 in the same paper).
Parameters
----------
corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`) used to update the
model.
chunksize : int, optional
Number of documents to be used in each training chunk.
decay : float, optional
A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
when each new document is examined. Corresponds to Kappa from
`Matthew D. Hoffman, David M. Blei, Francis Bach:
"Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
offset : float, optional
Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach:
"Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
passes : int, optional
Number of passes through the corpus during training.
update_every : int, optional
Number of documents to be iterated through for each update.
Set to 0 for batch learning, > 1 for online iterative learning.
eval_every : int, optional
Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x.
iterations : int, optional
Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.
gamma_threshold : float, optional
Minimum change in the value of the gamma parameters to continue iterating.
chunks_as_numpy : bool, optional
Whether each chunk passed to the inference step should be a numpy.ndarray or not. Numpy can in some settings
turn the term IDs into floats, these will be converted back into integers in inference, which incurs a
performance hit. For distributed computing it may be desirable to keep the chunks as `numpy.ndarray`.
"""
# use parameters given in constructor, unless user explicitly overrode them
if decay is None:
decay = self.decay
if offset is None:
offset = self.offset
if passes is None:
passes = self.passes
if update_every is None:
update_every = self.update_every
if eval_every is None:
eval_every = self.eval_every
if iterations is None:
iterations = self.iterations
if gamma_threshold is None:
gamma_threshold = self.gamma_threshold
try:
lencorpus = len(corpus)
except Exception:
logger.warning("input corpus stream has no len(); counting documents")
lencorpus = sum(1 for _ in corpus)
if lencorpus == 0:
logger.warning("LdaModel.update() called with an empty corpus")
return
if chunksize is None:
chunksize = min(lencorpus, self.chunksize)
self.state.numdocs += lencorpus
if update_every:
updatetype = "online"
if passes == 1:
updatetype += " (single-pass)"
else:
updatetype += " (multi-pass)"
updateafter = min(lencorpus, update_every * self.numworkers * chunksize)
else:
updatetype = "batch"
updateafter = lencorpus
evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize)
updates_per_pass = max(1, lencorpus / updateafter)
logger.info(
"running %s LDA training, %s topics, %i passes over "
"the supplied corpus of %i documents, updating model once "
"every %i documents, evaluating perplexity every %i documents, "
"iterating %ix with a convergence threshold of %f",
updatetype, self.num_topics, passes, lencorpus,
updateafter, evalafter, iterations,
gamma_threshold
)
if updates_per_pass * passes < 10:
logger.warning(
"too few updates, training might not converge; "
"consider increasing the number of passes or iterations to improve accuracy"
)
# rho is the "speed" of updating; TODO try other fncs
# pass_ + num_updates handles increasing the starting t for each pass,
# while allowing it to "reset" on the first pass of each update
def rho():
return pow(offset + pass_ + (self.num_updates / chunksize), -decay)
if self.callbacks:
# pass the list of input callbacks to Callback class
callback = Callback(self.callbacks)
callback.set_model(self)
# initialize metrics list to store metric values after every epoch
self.metrics = defaultdict(list)
for pass_ in xrange(passes):
if self.dispatcher:
logger.info('initializing %s workers', self.numworkers)
self.dispatcher.reset(self.state)
else:
other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
dirty = False
reallen = 0
chunks = utils.grouper(corpus, chunksize, as_numpy=chunks_as_numpy, dtype=self.dtype)
for chunk_no, chunk in enumerate(chunks):
reallen += len(chunk) # keep track of how many documents we've processed so far
if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
self.log_perplexity(chunk, total_docs=lencorpus)
if self.dispatcher:
# add the chunk to dispatcher's job queue, so workers can munch on it
logger.info(
"PROGRESS: pass %i, dispatching documents up to #%i/%i",
pass_, chunk_no * chunksize + len(chunk), lencorpus
)
# this will eventually block until some jobs finish, because the queue has a small finite length
self.dispatcher.putjob(chunk)
else:
logger.info(
"PROGRESS: pass %i, at document #%i/%i",
pass_, chunk_no * chunksize + len(chunk), lencorpus
)
gammat = self.do_estep(chunk, other)
if self.optimize_alpha:
self.update_alpha(gammat, rho())
dirty = True
del chunk
# perform an M step. determine when based on update_every, don't do this after every chunk
if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
if self.dispatcher:
# distributed mode: wait for all workers to finish
logger.info("reached the end of input; now waiting for all remaining jobs to finish")
other = self.dispatcher.getstate()
self.do_mstep(rho(), other, pass_ > 0)
del other # frees up memory
if self.dispatcher:
logger.info('initializing workers')
self.dispatcher.reset(self.state)
else:
other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
dirty = False
if reallen != lencorpus:
raise RuntimeError("input corpus size changed during training (don't use generators as input)")
# append current epoch's metric values
if self.callbacks:
current_metrics = callback.on_epoch_end(pass_)
for metric, value in current_metrics.items():
self.metrics[metric].append(value)
if dirty:
# finish any remaining updates
if self.dispatcher:
# distributed mode: wait for all workers to finish
logger.info("reached the end of input; now waiting for all remaining jobs to finish")
other = self.dispatcher.getstate()
self.do_mstep(rho(), other, pass_ > 0)
del other
dirty = False
def do_mstep(self, rho, other, extra_pass=False):
"""Maximization step: use linear interpolation between the existing topics and
collected sufficient statistics in `other` to update the topics.
Parameters
----------
rho : float
Learning rate.
other : :class:`~gensim.models.ldamodel.LdaModel`
The model whose sufficient statistics will be used to update the topics.
extra_pass : bool, optional
Whether this step required an additional pass over the corpus.
"""
logger.debug("updating topics")
# update self with the new blend; also keep track of how much did
# the topics change through this update, to assess convergence
diff = np.log(self.expElogbeta)
self.state.blend(rho, other)
diff -= self.state.get_Elogbeta()
self.sync_state()
# print out some debug info at the end of each EM iteration
self.print_topics(5)
logger.info("topic diff=%f, rho=%f", np.mean(np.abs(diff)), rho)
if self.optimize_eta:
self.update_eta(self.state.get_lambda(), rho)
if not extra_pass:
# only update if this isn't an additional pass
self.num_updates += other.numdocs
def bound(self, corpus, gamma=None, subsample_ratio=1.0):
"""Estimate the variational bound of documents from the corpus as E_q[log p(corpus)] - E_q[log q(corpus)].
Parameters
----------
corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`) used to estimate the
variational bounds.
gamma : numpy.ndarray, optional
Topic weight variational parameters for each document. If not supplied, it will be inferred from the model.
subsample_ratio : float, optional
Percentage of the whole corpus represented by the passed `corpus` argument (in case this was a sample).
Set to 1.0 if the whole corpus was passed.This is used as a multiplicative factor to scale the likelihood
appropriately.
Returns
-------
numpy.ndarray
The variational bound score calculated for each document.
"""
score = 0.0
_lambda = self.state.get_lambda()
Elogbeta = dirichlet_expectation(_lambda)
for d, doc in enumerate(corpus): # stream the input doc-by-doc, in case it's too large to fit in RAM
if d % self.chunksize == 0:
logger.debug("bound: at document #%i", d)
if gamma is None:
gammad, _ = self.inference([doc])
else:
gammad = gamma[d]
Elogthetad = dirichlet_expectation(gammad)
assert gammad.dtype == self.dtype
assert Elogthetad.dtype == self.dtype
# E[log p(doc | theta, beta)]
score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
# E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector
score += np.sum((self.alpha - gammad) * Elogthetad)
score += np.sum(gammaln(gammad) - gammaln(self.alpha))
score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gammad))
# Compensate likelihood for when `corpus` above is only a sample of the whole corpus. This ensures
# that the likelihood is always rougly on the same scale.
score *= subsample_ratio
# E[log p(beta | eta) - log q (beta | lambda)]; assumes eta is a scalar
score += np.sum((self.eta - _lambda) * Elogbeta)
score += np.sum(gammaln(_lambda) - gammaln(self.eta))
if np.ndim(self.eta) == 0:
sum_eta = self.eta * self.num_terms
else:
sum_eta = np.sum(self.eta)
score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))
return score
def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
"""Get a representation for selected topics.
Parameters
----------
num_topics : int, optional
Number of topics to be returned. Unlike LSA, there is no natural ordering between the topics in LDA.
The returned topics subset of all topics is therefore arbitrary and may change between two LDA
training runs.
num_words : int, optional
Number of words to be presented for each topic. These will be the most relevant words (assigned the highest
probability for each topic).
log : bool, optional
Whether the output is also logged, besides being returned.
formatted : bool, optional
Whether the topic representations should be formatted as strings. If False, they are returned as
2 tuples of (word, probability).
Returns
-------
list of {str, tuple of (str, float)}
a list of topics, each represented either as a string (when `formatted` == True) or word-probability
pairs.
"""
if num_topics < 0 or num_topics >= self.num_topics:
num_topics = self.num_topics
chosen_topics = range(num_topics)
else:
num_topics = min(num_topics, self.num_topics)
# add a little random jitter, to randomize results around the same alpha
sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
# random_state.rand returns float64, but converting back to dtype won't speed up anything
sorted_topics = list(matutils.argsort(sort_alpha))
chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
shown = []
topic = self.state.get_lambda()
for i in chosen_topics:
topic_ = topic[i]
topic_ = topic_ / topic_.sum() # normalize to probability distribution
bestn = matutils.argsort(topic_, num_words, reverse=True)
topic_ = [(self.id2word[id], topic_[id]) for id in bestn]
if formatted:
topic_ = ' + '.join(['%.3f*"%s"' % (v, k) for k, v in topic_])
shown.append((i, topic_))
if log:
logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic_)
return shown
def show_topic(self, topicid, topn=10):
"""Get the representation for a single topic. Words here are the actual strings, in constrast to
:meth:`~gensim.models.ldamodel.LdaModel.get_topic_terms` that represents words by their vocabulary ID.
Parameters
----------
topicid : int
The ID of the topic to be returned
topn : int, optional
Number of the most significant words that are associated with the topic.
Returns
-------
list of (str, float)
Word - probability pairs for the most relevant words generated by the topic.
"""
return [(self.id2word[id], value) for id, value in self.get_topic_terms(topicid, topn)]
def get_topics(self):
"""Get the term-topic matrix learned during inference.
Returns
-------
numpy.ndarray
The probability for each word in each topic, shape (`num_topics`, `vocabulary_size`).
"""
topics = self.state.get_lambda()
return topics / topics.sum(axis=1)[:, None]
def get_topic_terms(self, topicid, topn=10):
"""Get the representation for a single topic. Words the integer IDs, in constrast to
:meth:`~gensim.models.ldamodel.LdaModel.show_topic` that represents words by the actual strings.
Parameters
----------
topicid : int
The ID of the topic to be returned
topn : int, optional
Number of the most significant words that are associated with the topic.
Returns
-------
list of (int, float)
Word ID - probability pairs for the most relevant words generated by the topic.
"""
topic = self.get_topics()[topicid]
topic = topic / topic.sum() # normalize to probability distribution
bestn = matutils.argsort(topic, topn, reverse=True)
return [(idx, topic[idx]) for idx in bestn]
def top_topics(self, corpus=None, texts=None, dictionary=None, window_size=None,
coherence='u_mass', topn=20, processes=-1):
"""Get the topics with the highest coherence score the coherence for each topic.
Parameters
----------
corpus : iterable of list of (int, float), optional
Corpus in BoW format.
texts : list of list of str, optional
Tokenized texts, needed for coherence models that use sliding window based (i.e. coherence=`c_something`)
probability estimator .
dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
Gensim dictionary mapping of id word to create corpus.
If `model.id2word` is present, this is not needed. If both are provided, passed `dictionary` will be used.
window_size : int, optional
Is the size of the window to be used for coherence measures using boolean sliding window as their
probability estimator. For 'u_mass' this doesn't matter.
If None - the default window sizes are used which are: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10.
coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional
Coherence measure to be used.
Fastest method - 'u_mass', 'c_uci' also known as `c_pmi`.
For 'u_mass' corpus should be provided, if texts is provided, it will be converted to corpus
using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' `texts` should be provided (`corpus` isn't needed)
topn : int, optional
Integer corresponding to the number of top words to be extracted from each topic.
processes : int, optional
Number of processes to use for probability estimation phase, any value less than 1 will be interpreted as
num_cpus - 1.
Returns
-------
list of (list of (int, str), float)
Each element in the list is a pair of a topic representation and its coherence score. Topic representations
are distributions of words, represented as a list of pairs of word IDs and their probabilities.
"""
cm = CoherenceModel(
model=self, corpus=corpus, texts=texts, dictionary=dictionary,
window_size=window_size, coherence=coherence, topn=topn,
processes=processes
)
coherence_scores = cm.get_coherence_per_topic()
str_topics = []
for topic in self.get_topics(): # topic = array of vocab_size floats, one per term
bestn = matutils.argsort(topic, topn=topn, reverse=True) # top terms for topic
beststr = [(topic[_id], self.id2word[_id]) for _id in bestn] # membership, token
str_topics.append(beststr) # list of topn (float membership, token) tuples
scored_topics = zip(str_topics, coherence_scores)
return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)
def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None,
per_word_topics=False):
"""Get the topic distribution for the given document.
Parameters
----------
bow : corpus : list of (int, float)
The document in BOW format.
minimum_probability : float
Topics with an assigned probability lower than this threshold will be discarded.
minimum_phi_value : float
f `per_word_topics` is True, this represents a lower bound on the term probabilities that are included.
If set to None, a value of 1e-8 is used to prevent 0s.
per_word_topics : bool
If True, this function will also return two extra lists as explained in the "Returns" section.
Returns
-------
list of (int, float)
Topic distribution for the whole document. Each element in the list is a pair of a topic's id, and
the probability that was assigned to it.
list of (int, list of (int, float), optional
Most probable topics per word. Each element in the list is a pair of a word's id, and a list of
topics sorted by their relevance to this word. Only returned if `per_word_topics` was set to True.
list of (int, list of float), optional
Phi relevance values, multipled by the feature length, for each word-topic combination.
Each element in the list is a pair of a word's id and a list of the phi values between this word and
each topic. Only returned if `per_word_topics` was set to True.
"""
if minimum_probability is None:
minimum_probability = self.minimum_probability
minimum_probability = max(minimum_probability, 1e-8) # never allow zero values in sparse output
if minimum_phi_value is None:
minimum_phi_value = self.minimum_probability
minimum_phi_value = max(minimum_phi_value, 1e-8) # never allow zero values in sparse output
# if the input vector is a corpus, return a transformed corpus
is_corpus, corpus = utils.is_corpus(bow)
if is_corpus:
kwargs = dict(
per_word_topics=per_word_topics,
minimum_probability=minimum_probability,
minimum_phi_value=minimum_phi_value
)
return self._apply(corpus, **kwargs)
gamma, phis = self.inference([bow], collect_sstats=per_word_topics)
topic_dist = gamma[0] / sum(gamma[0]) # normalize distribution
document_topics = [
(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
if topicvalue >= minimum_probability
]
if not per_word_topics:
return document_topics
word_topic = [] # contains word and corresponding topic
word_phi = [] # contains word and phi values
for word_type, weight in bow:
phi_values = [] # contains (phi_value, topic) pairing to later be sorted
phi_topic = [] # contains topic and corresponding phi value to be returned 'raw' to user
for topic_id in range(0, self.num_topics):
if phis[topic_id][word_type] >= minimum_phi_value:
# appends phi values for each topic for that word
# these phi values are scaled by feature length
phi_values.append((phis[topic_id][word_type], topic_id))
phi_topic.append((topic_id, phis[topic_id][word_type]))
# list with ({word_id => [(topic_0, phi_value), (topic_1, phi_value) ...]).
word_phi.append((word_type, phi_topic))
# sorts the topics based on most likely topic
# returns a list like ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]).
sorted_phi_values = sorted(phi_values, reverse=True)
topics_sorted = [x[1] for x in sorted_phi_values]
word_topic.append((word_type, topics_sorted))
return document_topics, word_topic, word_phi # returns 2-tuple
def get_term_topics(self, word_id, minimum_probability=None):
"""Get the most relevant topics to the given word.
Parameters
----------
word_id : int
The word for which the topic distribution will be computed.
minimum_probability : float, optional
Topics with an assigned probability below this threshold will be discarded.
Returns
-------
list of (int, float)
The relevant topics represented as pairs of their ID and their assigned probability, sorted
by relevance to the given word.
"""
if minimum_probability is None:
minimum_probability = self.minimum_probability
minimum_probability = max(minimum_probability, 1e-8) # never allow zero values in sparse output
# if user enters word instead of id in vocab, change to get id
if isinstance(word_id, str):
word_id = self.id2word.doc2bow([word_id])[0][0]
values = []
for topic_id in range(0, self.num_topics):
if self.expElogbeta[topic_id][word_id] >= minimum_probability:
values.append((topic_id, self.expElogbeta[topic_id][word_id]))
return values
def diff(self, other, distance="kullback_leibler", num_words=100,
n_ann_terms=10, diagonal=False, annotation=True, normed=True):
"""Calculate the difference in topic distributions between two models: `self` and `other`.
Parameters
----------
other : :class:`~gensim.models.ldamodel.LdaModel`
The model which will be compared against the current object.
distance : {'kullback_leibler', 'hellinger', 'jaccard', 'jensen_shannon'}
The distance metric to calculate the difference with.
num_words : int, optional
The number of most relevant words used if `distance == 'jaccard'`. Also used for annotating topics.
n_ann_terms : int, optional
Max number of words in intersection/symmetric difference between topics. Used for annotation.
diagonal : bool, optional
Whether we need the difference between identical topics (the diagonal of the difference matrix).
annotation : bool, optional
Whether the intersection or difference of words between two topics should be returned.
normed : bool, optional
Whether the matrix should be normalized or not.
Returns
-------
numpy.ndarray
A difference matrix. Each element corresponds to the difference between the two topics,
shape (`self.num_topics`, `other.num_topics`)
numpy.ndarray, optional
Annotation matrix where for each pair we include the word from the intersection of the two topics,
and the word from the symmetric difference of the two topics. Only included if `annotation == True`.
Shape (`self.num_topics`, `other_model.num_topics`, 2).
Examples
--------
Get the differences between each pair of topics inferred by two models
>>> from gensim.models.ldamulticore import LdaMulticore
>>> from gensim.test.utils import datapath
>>>
>>> m1, m2 = LdaMulticore.load(datapath("lda_3_0_1_model")), LdaMulticore.load(datapath("ldamodel_python_3_5"))
>>> mdiff, annotation = m1.diff(m2)
>>> topic_diff = mdiff # get matrix with difference for each topic pair from `m1` and `m2`
"""
distances = {
"kullback_leibler": kullback_leibler,
"hellinger": hellinger,
"jaccard": jaccard_distance,
"jensen_shannon": jensen_shannon
}
if distance not in distances:
valid_keys = ", ".join("`{}`".format(x) for x in distances.keys())
raise ValueError("Incorrect distance, valid only {}".format(valid_keys))
if not isinstance(other, self.__class__):
raise ValueError("The parameter `other` must be of type `{}`".format(self.__name__))
distance_func = distances[distance]
d1, d2 = self.get_topics(), other.get_topics()
t1_size, t2_size = d1.shape[0], d2.shape[0]
annotation_terms = None
fst_topics = [{w for (w, _) in self.show_topic(topic, topn=num_words)} for topic in xrange(t1_size)]
snd_topics = [{w for (w, _) in other.show_topic(topic, topn=num_words)} for topic in xrange(t2_size)]
if distance == "jaccard":
d1, d2 = fst_topics, snd_topics
if diagonal:
assert t1_size == t2_size, \
"Both input models should have same no. of topics, " \
"as the diagonal will only be valid in a square matrix"
# initialize z and annotation array
z = np.zeros(t1_size)
if annotation:
annotation_terms = np.zeros(t1_size, dtype=list)
else:
# initialize z and annotation matrix
z = np.zeros((t1_size, t2_size))
if annotation:
annotation_terms = np.zeros((t1_size, t2_size), dtype=list)
# iterate over each cell in the initialized z and annotation
for topic in np.ndindex(z.shape):
topic1 = topic[0]
if diagonal:
topic2 = topic1
else:
topic2 = topic[1]
z[topic] = distance_func(d1[topic1], d2[topic2])
if annotation:
pos_tokens = fst_topics[topic1] & snd_topics[topic2]
neg_tokens = fst_topics[topic1].symmetric_difference(snd_topics[topic2])
pos_tokens = list(pos_tokens)[:min(len(pos_tokens), n_ann_terms)]
neg_tokens = list(neg_tokens)[:min(len(neg_tokens), n_ann_terms)]
annotation_terms[topic] = [pos_tokens, neg_tokens]
if normed:
if np.abs(np.max(z)) > 1e-8:
z /= np.max(z)
return z, annotation_terms
def __getitem__(self, bow, eps=None):
"""Get the topic distribution for the given document.
Wraps :meth:`~gensim.models.ldamodel.LdaModel.get_document_topics` to support an operator style call.
Uses the model's current state (set using constructor arguments) to fill in the additional arguments of the
wrapper method.
Parameters
---------
bow : list of (int, float)
The document in BOW format.
eps : float, optional
Topics with an assigned probability lower than this threshold will be discarded.
Returns
-------
list of (int, float)
Topic distribution for the given document. Each topic is represented as a pair of its ID and the probability
assigned to it.
"""
return self.get_document_topics(bow, eps, self.minimum_phi_value, self.per_word_topics)
def save(self, fname, ignore=('state', 'dispatcher'), separately=None, *args, **kwargs):
"""Save the model to a file.
Large internal arrays may be stored into separate files, with `fname` as prefix.
Notes
-----
If you intend to use models across Python 2/3 versions there are a few things to
keep in mind:
1. The pickled Python dictionaries will not work across Python versions
2. The `save` method does not automatically save all numpy arrays separately, only
those ones that exceed `sep_limit` set in :meth:`~gensim.utils.SaveLoad.save`. The main
concern here is the `alpha` array if for instance using `alpha='auto'`.
Please refer to the `wiki recipes section
<https://github.com/RaRe-Technologies/gensim/wiki/
Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2>`_
for an example on how to work around these issues.
See Also
--------
:meth:`~gensim.models.ldamodel.LdaModel.load`
Load model.
Parameters
----------
fname : str
Path to the system file where the model will be persisted.
ignore : tuple of str, optional
The named attributes in the tuple will be left out of the pickled model. The reason why
the internal `state` is ignored by default is that it uses its own serialisation rather than the one
provided by this method.
separately : {list of str, None}, optional
If None - automatically detect large numpy/scipy.sparse arrays in the object being stored, and store
them into separate files. This avoids pickle memory errors and allows `mmap`'ing large arrays
back on load efficiently. If list of str - this attributes will be stored in separate files,
the automatic check is not performed in this case.
*args
Positional arguments propagated to :meth:`~gensim.utils.SaveLoad.save`.
**kwargs
Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.save`.
"""
if self.state is not None:
self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)
# Save the dictionary separately if not in 'ignore'.
if 'id2word' not in ignore:
utils.pickle(self.id2word, utils.smart_extension(fname, '.id2word'))
# make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if
# someone sets the ignore list themselves
if ignore is not None and ignore:
if isinstance(ignore, six.string_types):
ignore = [ignore]
ignore = [e for e in ignore if e] # make sure None and '' are not in the list
ignore = list({'state', 'dispatcher', 'id2word'} | set(ignore))
else:
ignore = ['state', 'dispatcher', 'id2word']
# make sure 'expElogbeta' and 'sstats' are ignored from the pickled object, even if
# someone sets the separately list themselves.
separately_explicit = ['expElogbeta', 'sstats']
# Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some
# array manually.
if (isinstance(self.alpha, six.string_types) and self.alpha == 'auto') or \
(isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1):
separately_explicit.append('alpha')
if (isinstance(self.eta, six.string_types) and self.eta == 'auto') or \
(isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1):
separately_explicit.append('eta')
# Merge separately_explicit with separately.
if separately:
if isinstance(separately, six.string_types):
separately = [separately]
separately = [e for e in separately if e] # make sure None and '' are not in the list
separately = list(set(separately_explicit) | set(separately))
else:
separately = separately_explicit
super(LdaModel, self).save(fname, ignore=ignore, separately=separately, *args, **kwargs)
@classmethod
def load(cls, fname, *args, **kwargs):
"""Load a previously saved :class:`gensim.models.ldamodel.LdaModel` from file.
See Also
--------
:meth:`~gensim.models.ldamodel.LdaModel.save`
Save model.
Parameters
----------
fname : str
Path to the file where the model is stored.
*args
Positional arguments propagated to :meth:`~gensim.utils.SaveLoad.load`.
**kwargs
Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.load`.
Examples
--------
Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:
>>> from gensim.test.utils import datapath
>>>
>>> fname = datapath("lda_3_0_1_model")
>>> lda = LdaModel.load(fname, mmap='r')
"""
kwargs['mmap'] = kwargs.get('mmap', None)
result = super(LdaModel, cls).load(fname, *args, **kwargs)
# check if `random_state` attribute has been set after main pickle load
# if set -> the model to be loaded was saved using a >= 0.13.2 version of Gensim
# if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim,
# so set `random_state` as the default value
if not hasattr(result, 'random_state'):
result.random_state = utils.get_random_state(None) # using default value `get_random_state(None)`
logging.warning("random_state not set so using default value")
# dtype could be absent in old models
if not hasattr(result, 'dtype'):
result.dtype = np.float64 # float64 was implicitly used before (cause it's default in numpy)
logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)
state_fname = utils.smart_extension(fname, '.state')
try:
result.state = LdaState.load(state_fname, *args, **kwargs)
except Exception as e:
logging.warning("failed to load state from %s: %s", state_fname, e)
id2word_fname = utils.smart_extension(fname, '.id2word')
# check if `id2word_fname` file is present on disk
# if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim,
# so set `result.id2word` using the `id2word_fname` file
# if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim,
# so `result.id2word` already set after the main pickle load
if os.path.isfile(id2word_fname):
try:
result.id2word = utils.unpickle(id2word_fname)
except Exception as e:
logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e)
return result