#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
# Based on Copyright (C) 2016 Radim Rehurek <radimrehurek@seznam.cz>
"""Lda Sequence model, inspired by `David M. Blei, John D. Lafferty: "Dynamic Topic Models"
<https://mimno.infosci.cornell.edu/info6150/readings/dynamic_topic_models.pdf>`_ .
The original C/C++ implementation can be found on `blei-lab/dtm <https://github.com/blei-lab/dtm>`_.
TODO: The next steps to take this forward would be:
#. Include DIM mode. Most of the infrastructure for this is in place.
#. See if LdaPost can be replaced by LdaModel completely without breaking anything.
#. Heavy lifting going on in the Sslm class - efforts can be made to cythonise mathematical methods, in particular
`update_obs`, where the optimization takes a lot of time.
#. Try and make it distributed, especially around the E and M step.
#. Remove all C/C++ coding style/syntax.
Examples
--------
Set up a model using a corpus of 9 documents, with 2 in the first time-slice, 4 in the second, and 3 in the third
>>> from gensim.test.utils import common_corpus
>>> from gensim.models import LdaSeqModel
>>>
>>> ldaseq = LdaSeqModel(corpus=common_corpus, time_slice=[2, 4, 3], num_topics=2, chunksize=1)
Persist a model to disk and reload it later
>>> from gensim.test.utils import datapath
>>>
>>> temp_file = datapath("model")
>>> ldaseq.save(temp_file)
>>>
>>> # Load a potentially pre-trained model from disk.
>>> ldaseq = LdaSeqModel.load(temp_file)
Access the document embeddings generated from the DTM
>>> doc = common_corpus[1]
>>>
>>> embedding = ldaseq[doc]
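Inspect the most relevant words of every topic at a given time slice (a quick sketch, reusing the `ldaseq` model trained above)
>>> topics_at_start = ldaseq.print_topics(time=0, top_terms=5)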
"""
from gensim import utils, matutils
from gensim.models import ldamodel
import numpy as np
from scipy.special import digamma, gammaln
from scipy import optimize
import logging
logger = logging.getLogger(__name__)
class LdaSeqModel(utils.SaveLoad):
"""Estimate Dynamic Topic Model parameters based on a training corpus."""
def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10,
initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10,
random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
"""
Parameters
----------
corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).
If not given, the model is left untrained (presumably because you want to call
:meth:`~gensim.models.ldamodel.LdaSeqModel.update` manually).
time_slice : list of int, optional
Number of documents in each time-slice. Each time slice could for example represent a year's published
papers, in case the corpus comes from a journal publishing over multiple years.
It is assumed that `sum(time_slice) == num_documents`.
id2word : dict of (int, str), optional
Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for
debugging and topic printing.
alphas : float, optional
The prior probability for the model.
num_topics : int, optional
The number of requested latent topics to be extracted from the training corpus.
initialize : {'gensim', 'own', 'ldamodel'}, optional
Controls the initialization of the DTM model. Supports three different modes:
* 'gensim': Uses gensim's LDA initialization.
* 'own': Uses your own initialization matrix of an LDA model that has been previously trained.
* 'ldamodel': Use a previously trained LDA model, passing it through the `lda_model` argument.
sstats : numpy.ndarray , optional
Sufficient statistics used for initializing the model if `initialize == 'own'`. Corresponds to matrix
beta in the linked paper for time slice 0, expected shape (`self.vocab_len`, `num_topics`).
lda_model : :class:`~gensim.models.ldamodel.LdaModel`
Model whose sufficient statistics will be used to initialize the current object if `initialize == 'ldamodel'`.
obs_variance : float, optional
Observed variance used to approximate the true and forward variance as shown in
`David M. Blei, John D. Lafferty: "Dynamic Topic Models"
<https://mimno.infosci.cornell.edu/info6150/readings/dynamic_topic_models.pdf>`_.
chain_variance : float, optional
Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time.
passes : int, optional
Number of passes over the corpus for the initial :class:`~gensim.models.ldamodel.LdaModel`
random_state : {numpy.random.RandomState, int}, optional
Can be a np.random.RandomState object, or the seed to generate one. Used for reproducibility of results.
lda_inference_max_iter : int, optional
Maximum number of iterations in the inference step of the LDA training.
em_min_iter : int, optional
Minimum number of iterations until convergence of the Expectation-Maximization algorithm.
em_max_iter : int, optional
Maximum number of iterations until convergence of the Expectation-Maximization algorithm.
chunksize : int, optional
Number of documents in the corpus to be processed in a chunk.
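Examples
--------
A minimal sketch of the 'ldamodel' initialization mode, reusing a previously trained
:class:`~gensim.models.ldamodel.LdaModel` (built here from the `common_corpus`/`common_dictionary` test fixtures)
>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.models import LdaModel, LdaSeqModel
>>>
>>> pre_trained = LdaModel(common_corpus, id2word=common_dictionary, num_topics=2)
>>> ldaseq = LdaSeqModel(
...     corpus=common_corpus, id2word=common_dictionary, time_slice=[2, 4, 3],
...     num_topics=2, initialize='ldamodel', lda_model=pre_trained, chunksize=1
... )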
"""
self.id2word = id2word
if corpus is None and self.id2word is None:
raise ValueError(
'at least one of corpus/id2word must be specified, to establish input space dimensionality'
)
if self.id2word is None:
logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
self.id2word = utils.dict_from_corpus(corpus)
self.vocab_len = len(self.id2word)
elif len(self.id2word) > 0:
self.vocab_len = len(self.id2word)
else:
self.vocab_len = 0
if corpus is not None:
try:
self.corpus_len = len(corpus)
except TypeError:
logger.warning("input corpus stream has no len(); counting documents")
self.corpus_len = sum(1 for _ in corpus)
self.time_slice = time_slice
if self.time_slice is not None:
self.num_time_slices = len(time_slice)
max_doc_len = 0
for line_no, line in enumerate(corpus):
if len(line) > max_doc_len:
max_doc_len = len(line)
self.max_doc_len = max_doc_len
self.num_topics = num_topics
self.num_time_slices = len(time_slice)
self.alphas = np.full(num_topics, alphas)
# topic_chains contains for each topic a 'state space language model' object
# which in turn has information about each topic
# the sslm class is described below and contains information
# on topic-word probabilities and doc-topic probabilities.
self.topic_chains = []
for topic in range(0, num_topics):
sslm_ = sslm(
num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics,
chain_variance=chain_variance, obs_variance=obs_variance
)
self.topic_chains.append(sslm_)
# the following are class variables which are to be integrated during Document Influence Model
self.top_doc_phis = None
self.influence = None
self.renormalized_influence = None
self.influence_sum_lgl = None
# if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM.
if corpus is not None and time_slice is not None:
if initialize == 'gensim':
lda_model = ldamodel.LdaModel(
corpus, id2word=self.id2word, num_topics=self.num_topics,
passes=passes, alpha=self.alphas, random_state=random_state,
dtype=np.float64
)
self.sstats = np.transpose(lda_model.state.sstats)
if initialize == 'ldamodel':
self.sstats = np.transpose(lda_model.state.sstats)
if initialize == 'own':
self.sstats = sstats
# initialize model from sstats
self.init_ldaseq_ss(chain_variance, obs_variance, self.alphas, self.sstats)
# fit DTM
self.fit_lda_seq(corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize)
def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_suffstats):
"""Initialize State Space Language Model, topic-wise.
Parameters
----------
topic_chain_variance : float
Gaussian parameter defined in the beta distribution to dictate how the beta values evolve.
topic_obs_variance : float
Observed variance used to approximate the true and forward variance as shown in
`David M. Blei, John D. Lafferty: "Dynamic Topic Models"
<https://mimno.infosci.cornell.edu/info6150/readings/dynamic_topic_models.pdf>`_.
alpha : float
The prior probability for the model.
init_suffstats : numpy.ndarray
Sufficient statistics used for initializing the model, expected shape (`self.vocab_len`, `num_topics`).
"""
self.alphas = alpha
for k, chain in enumerate(self.topic_chains):
sstats = init_suffstats[:, k]
sslm.sslm_counts_init(chain, topic_obs_variance, topic_chain_variance, sstats)
# initialize the below matrices only if running DIM
# ldaseq.topic_chains[k].w_phi_l = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices))
# ldaseq.topic_chains[k].w_phi_sum = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices))
# ldaseq.topic_chains[k].w_phi_sq = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices))
def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize):
"""Fit a LDA Sequence model (DTM).
This method will iteratively setup LDA models and perform EM steps until the sufficient statistics convergence,
or until the maximum number of iterations is reached. Because the true posterior is intractable, an
appropriately tight lower bound must be used instead. This function will optimize this bound, by minimizing
its true Kullback-Liebler Divergence with the true posterior.
Parameters
----------
corpus : {iterable of list of (int, float), scipy.sparse.csc}
Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).
lda_inference_max_iter : int
Maximum number of iterations for the inference step of LDA.
em_min_iter : int
Minimum number of EM iterations to perform before checking for convergence.
em_max_iter : int
Maximum number of EM iterations.
chunksize : int
Number of documents to be processed in each chunk.
Returns
-------
float
The highest lower bound for the true posterior produced after all iterations.
"""
LDASQE_EM_THRESHOLD = 1e-4
# if bound is low, then we increase iterations.
LOWER_ITER = 10
ITER_MULT_LOW = 2
MAX_ITER = 500
num_topics = self.num_topics
vocab_len = self.vocab_len
data_len = self.num_time_slices
corpus_len = self.corpus_len
bound = 0
convergence = LDASQE_EM_THRESHOLD + 1
iter_ = 0
while iter_ < em_min_iter or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= em_max_iter):
logger.info(" EM iter %i", iter_)
logger.info("E Step")
# TODO: bound is initialized to 0
old_bound = bound
# initiate sufficient statistics
topic_suffstats = []
for topic in range(0, num_topics):
topic_suffstats.append(np.resize(np.zeros(vocab_len * data_len), (vocab_len, data_len)))
# set up variables
gammas = np.resize(np.zeros(corpus_len * num_topics), (corpus_len, num_topics))
lhoods = np.resize(np.zeros(corpus_len * num_topics + 1), (corpus_len, num_topics + 1))
# compute the likelihood of a sequential corpus under an LDA
# seq model and find the evidence lower bound. This is the E - Step
bound, gammas = \
self.lda_seq_infer(corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter, chunksize)
self.gammas = gammas
logger.info("M Step")
# fit the variational distribution. This is the M - Step
topic_bound = self.fit_lda_seq_topics(topic_suffstats)
bound += topic_bound
if (bound - old_bound) < 0:
# if max_iter is too low, increase iterations.
if lda_inference_max_iter < LOWER_ITER:
lda_inference_max_iter *= ITER_MULT_LOW
logger.info("Bound went down, increasing iterations to %i", lda_inference_max_iter)
# check for convergence
convergence = np.fabs((bound - old_bound) / old_bound)
if convergence < LDASQE_EM_THRESHOLD:
lda_inference_max_iter = MAX_ITER
logger.info("Starting final iterations, max iter is %i", lda_inference_max_iter)
convergence = 1.0
logger.info("iteration %i iteration lda seq bound is %f convergence is %f", iter_, bound, convergence)
iter_ += 1
return bound
def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods,
iter_, lda_inference_max_iter, chunksize):
"""Inference (or E-step) for the lower bound EM optimization.
This is used to set up the gensim :class:`~gensim.models.ldamodel.LdaModel` to be used for each time-slice.
It also allows for Document Influence Model code to be written in.
Parameters
----------
corpus : {iterable of list of (int, float), scipy.sparse.csc}
Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).
topic_suffstats : list of numpy.ndarray
Sufficient statistics to be accumulated during the E-step, one matrix of expected shape
(`self.vocab_len`, `num_time_slices`) per topic.
gammas : numpy.ndarray
Topic weight variational parameters for each document. If not supplied, it will be inferred from the model.
lhoods : numpy.ndarray
Log likelihood lower bound terms for each document and topic, expected shape
(`corpus_len`, `num_topics` + 1).
iter_ : int
Current iteration.
lda_inference_max_iter : int
Maximum number of iterations for the inference step of LDA.
chunksize : int
Number of documents to be processed in each chunk.
Returns
-------
(float, list of float)
The first value is the highest lower bound for the true posterior.
The second value is the list of optimized dirichlet variational parameters for the approximation of
the posterior.
"""
num_topics = self.num_topics
vocab_len = self.vocab_len
bound = 0.0
lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
lda.topics = np.array(np.split(np.zeros(vocab_len * num_topics), vocab_len))
ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda)
model = "DTM"
if model == "DTM":
bound, gammas = self.inferDTMseq(
corpus, topic_suffstats, gammas, lhoods, lda,
ldapost, iter_, bound, lda_inference_max_iter, chunksize
)
elif model == "DIM":
self.InfluenceTotalFixed(corpus)
bound, gammas = self.inferDIMseq(
corpus, topic_suffstats, gammas, lhoods, lda,
ldapost, iter_, bound, lda_inference_max_iter, chunksize
)
return bound, gammas
def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda,
ldapost, iter_, bound, lda_inference_max_iter, chunksize):
"""Compute the likelihood of a sequential corpus under an LDA seq model, and reports the likelihood bound.
Parameters
----------
corpus : {iterable of list of (int, float), scipy.sparse.csc}
Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).
topic_suffstats : list of numpy.ndarray
Sufficient statistics to be accumulated during the E-step, one matrix of expected shape
(`self.vocab_len`, `num_time_slices`) per topic.
gammas : numpy.ndarray
Topic weight variational parameters for each document. If not supplied, it will be inferred from the model.
lhoods : numpy.ndarray
Log likelihood lower bound terms for each document and topic, expected shape
(`corpus_len`, `num_topics` + 1).
lda : :class:`~gensim.models.ldamodel.LdaModel`
The trained LDA model of the previous iteration.
ldapost : :class:`~gensim.models.ldaseqmodel.LdaPost`
Posterior probability variables for the given LDA model. This will be used as the true (but intractable)
posterior.
iter_ : int
The current iteration.
bound : float
The LDA bound produced after all iterations.
lda_inference_max_iter : int
Maximum number of iterations for the inference step of LDA.
chunksize : int
Number of documents to be processed in each chunk.
Returns
-------
(float, list of float)
The first value is the highest lower bound for the true posterior.
The second value is the list of optimized dirichlet variational parameters for the approximation of
the posterior.
"""
doc_index = 0 # overall doc_index in corpus
time = 0 # current time-slice
doc_num = 0 # doc-index in current time-slice
lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice
time_slice = np.cumsum(np.array(self.time_slice))
for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
# iterates chunk size for constant memory footprint
for doc in chunk:
# this is used to update the time_slice and create a new lda_seq slice every new time_slice
if doc_index > time_slice[time]:
time += 1
lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice
doc_num = 0
gam = gammas[doc_index]
lhood = lhoods[doc_index]
ldapost.gamma = gam
ldapost.lhood = lhood
ldapost.doc = doc
# TODO: replace fit_lda_post with appropriate ldamodel functions, if possible.
if iter_ == 0:
doc_lhood = LdaPost.fit_lda_post(
ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter
)
else:
doc_lhood = LdaPost.fit_lda_post(
ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter
)
if topic_suffstats is not None:
topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, doc, topic_suffstats)
gammas[doc_index] = ldapost.gamma
bound += doc_lhood
doc_index += 1
doc_num += 1
return bound, gammas
def make_lda_seq_slice(self, lda, time):
"""Update the LDA model topic-word values using time slices.
Parameters
----------
lda : :class:`~gensim.models.ldamodel.LdaModel`
The stationary model to be updated
time : int
The time slice assigned to the stationary model.
Returns
-------
lda : :class:`~gensim.models.ldamodel.LdaModel`
The stationary model updated to reflect the passed time slice.
"""
for k in range(0, self.num_topics):
lda.topics[:, k] = np.copy(self.topic_chains[k].e_log_prob[:, time])
lda.alpha = np.copy(self.alphas)
return lda
def fit_lda_seq_topics(self, topic_suffstats):
"""Fit the sequential model topic-wise.
Parameters
----------
topic_suffstats : numpy.ndarray
Sufficient statistics of the current model, expected shape (`self.vocab_len`, `num_topics`).
Returns
-------
float
The sum of the optimized lower bounds for all topics.
"""
lhood = 0
for k, chain in enumerate(self.topic_chains):
logger.info("Fitting topic number %i", k)
lhood_term = sslm.fit_sslm(chain, topic_suffstats[k])
lhood += lhood_term
return lhood
def print_topic_times(self, topic, top_terms=20):
"""Get the most relevant words for a topic, for each timeslice. This can be used to inspect the evolution of a
topic through time.
Parameters
----------
topic : int
The index of the topic.
top_terms : int, optional
Number of most relevant words associated with the topic to be returned.
Returns
-------
list of list of str
Top `top_terms` relevant terms for the topic for each time slice.
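Examples
--------
A quick sketch, assuming a trained `ldaseq` model with at least one topic
>>> words_over_time = ldaseq.print_topic_times(topic=0, top_terms=10)
>>> len(words_over_time) == ldaseq.num_time_slices
True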
"""
topics = []
for time in range(0, self.num_time_slices):
topics.append(self.print_topic(topic, time, top_terms))
return topics
def print_topics(self, time=0, top_terms=20):
"""Get the most relevant words for every topic.
Parameters
----------
time : int, optional
The time slice we are interested in (since topics evolve over time, it is expected that the most
relevant words will also gradually change).
top_terms : int, optional
Number of most relevant words to be returned for each topic.
Returns
-------
list of list of (str, float)
Representation of all topics. Each of them is represented by a list of pairs of words and their assigned
probability.
"""
return [self.print_topic(topic, time, top_terms) for topic in range(0, self.num_topics)]
def print_topic(self, topic, time=0, top_terms=20):
"""Get the list of words most relevant to the given topic.
Parameters
----------
topic : int
The index of the topic to be inspected.
time : int, optional
The time slice we are interested in (since topics evolve over time, it is expected that the most
relevant words will also gradually change).
top_terms : int, optional
Number of words associated with the topic to be returned.
Returns
-------
list of (str, float)
The representation of this topic. Each element in the list includes the word itself, along with the
probability assigned to it by the topic.
"""
topic = self.topic_chains[topic].e_log_prob
topic = np.transpose(topic)
topic = np.exp(topic[time])
topic = topic / topic.sum()
bestn = matutils.argsort(topic, top_terms, reverse=True)
beststr = [(self.id2word[id_], topic[id_]) for id_ in bestn]
return beststr
def doc_topics(self, doc_number):
"""Get the topic mixture for a document.
Uses the priors for the dirichlet distribution that approximates the true posterior with the optimal
lower bound, and therefore requires the model to be already trained.
Parameters
----------
doc_number : int
Index of the document for which the mixture is returned.
Returns
-------
list of length `self.num_topics`
Probability for each topic in the mixture (essentially a point on the `self.num_topics - 1` dimensional simplex).
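Examples
--------
A quick sketch, assuming a trained `ldaseq` model
>>> mixture = ldaseq.doc_topics(0)  # topic mixture of the first training document
>>> abs(sum(mixture) - 1.0) < 1e-6
True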
"""
doc_topic = np.copy(self.gammas)
doc_topic /= doc_topic.sum(axis=1)[:, np.newaxis]
return doc_topic[doc_number]
def dtm_vis(self, time, corpus):
"""Get the information needed to visualize the corpus model at a given time slice, using the pyLDAvis format.
Parameters
----------
time : int
The time slice we are interested in.
corpus : {iterable of list of (int, float), scipy.sparse.csc}
The corpus we want to visualize at the given time slice.
Returns
-------
doc_topics : numpy.ndarray
Topic mixture for every document (each row is a point on the `self.num_topics - 1` dimensional simplex),
expected shape (`corpus_len`, `num_topics`).
topic_term : numpy.ndarray
The representation of each topic as a multinomial over words in the vocabulary,
expected shape (`num_topics`, vocabulary length).
doc_lengths : list of int
The number of words in each document. These could be fixed, or drawn from a Poisson distribution.
term_frequency : numpy.ndarray
The overall frequency of each vocabulary term, summed over the given corpus, expected shape (`vocab_len`,).
vocab : list of str
The set of unique terms existing in the corpus's vocabulary.
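Examples
--------
A sketch of feeding the output into `pyLDAvis` (an optional third-party package; assumes a trained `ldaseq`
model and the corpus it was trained on)
>>> import pyLDAvis
>>> doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(time=0, corpus=common_corpus)
>>> vis = pyLDAvis.prepare(
...     topic_term_dists=topic_term, doc_topic_dists=doc_topic, doc_lengths=doc_lengths,
...     vocab=vocab, term_frequency=term_frequency
... )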
"""
doc_topic = np.copy(self.gammas)
doc_topic /= doc_topic.sum(axis=1)[:, np.newaxis]
topic_term = [
np.exp(np.transpose(chain.e_log_prob)[time]) / np.exp(np.transpose(chain.e_log_prob)[time]).sum()
for k, chain in enumerate(self.topic_chains)
]
doc_lengths = [len(doc) for doc_no, doc in enumerate(corpus)]
term_frequency = np.zeros(self.vocab_len)
for doc_no, doc in enumerate(corpus):
for pair in doc:
term_frequency[pair[0]] += pair[1]
vocab = [self.id2word[i] for i in range(0, len(self.id2word))]
return doc_topic, np.array(topic_term), doc_lengths, term_frequency, vocab
def dtm_coherence(self, time):
"""Get the coherence for each topic.
Can be used to measure the quality of the model, or to inspect the convergence through training via a callback.
Parameters
----------
time : int
The time slice.
Returns
-------
list of list of str
The word representation for each topic, for each time slice. This can be used to check the time coherence
of topics as time evolves: if the most relevant words remain the same then the topic has more or less
converged or is relatively static, while if they change rapidly the topic is still evolving.
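Examples
--------
A sketch of scoring the topics of one time slice with :class:`~gensim.models.coherencemodel.CoherenceModel`
(assumes a trained `ldaseq` model plus the corpus and dictionary it was trained on)
>>> from gensim.models import CoherenceModel
>>> topics_t0 = ldaseq.dtm_coherence(time=0)
>>> cm = CoherenceModel(topics=topics_t0, corpus=common_corpus, dictionary=common_dictionary, coherence='u_mass')
>>> coherence_t0 = cm.get_coherence()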
"""
coherence_topics = []
for topics in self.print_topics(time):
coherence_topic = []
for word, dist in topics:
coherence_topic.append(word)
coherence_topics.append(coherence_topic)
return coherence_topics
def __getitem__(self, doc):
"""Get the topic mixture for the given document, using the inferred approximation of the true posterior.
Parameters
----------
doc : list of (int, float)
The doc in BOW format. Can be an unseen document.
Returns
-------
list of float
Probabilities for each topic in the mixture. This is essentially a point in the `num_topics - 1` simplex.
"""
lda_model = \
ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
lda_model.topics = np.array(np.split(np.zeros(self.vocab_len * self.num_topics), self.vocab_len))
ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc)
time_lhoods = []
for time in range(0, self.num_time_slices):
lda_model = self.make_lda_seq_slice(lda_model, time) # create lda_seq slice
lhood = LdaPost.fit_lda_post(ldapost, 0, time, self)
time_lhoods.append(lhood)
doc_topic = ldapost.gamma / ldapost.gamma.sum()
# should even the likelihoods be returned?
return doc_topic
class sslm(utils.SaveLoad):
"""Encapsulate the inner State Space Language Model for DTM.
Some important attributes of this class:
* `obs` is a matrix of variational observations, one row per word and one column per time slice.
* `e_log_prob` is a matrix containing the expected log probability of each word in each time slice.
* `mean` contains the mean values to be used for inference for each word for a time slice.
* `variance` contains the variance values to be used for inference of each word in a time slice.
* `fwd_mean` and `fwd_variance` are the forward posterior values for the mean and the variance.
* `zeta` is an extra variational parameter with a value for each time slice.
"""
def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005):
self.vocab_len = vocab_len
self.num_time_slices = num_time_slices
self.obs_variance = obs_variance
self.chain_variance = chain_variance
self.num_topics = num_topics
# setting up matrices
self.obs = np.array(np.split(np.zeros(num_time_slices * vocab_len), vocab_len))
self.e_log_prob = np.array(np.split(np.zeros(num_time_slices * vocab_len), vocab_len))
self.mean = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
self.fwd_mean = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
self.fwd_variance = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
self.variance = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
self.zeta = np.zeros(num_time_slices)
# the following are class variables which are to be integrated during Document Influence Model
self.m_update_coeff = None
self.mean_t = None
self.variance_t = None
self.influence_sum_lgl = None
self.w_phi_l = None
self.w_phi_sum = None
self.w_phi_l_sq = None
self.m_update_coeff_g = None
def update_zeta(self):
"""Update the Zeta variational parameter.
Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)),
over every time-slice. It is the value of variational parameter zeta which maximizes the lower bound.
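Concretely, the update implemented below is (the `+ 1` offset skips the initial column of `mean` and `variance`):
.. math::
\zeta_t = \sum_{w} \exp\left( mean[w][t + 1] + \frac{variance[w][t + 1]}{2} \right)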
Returns
-------
list of float
The updated zeta values for each time slice.
"""
for j, val in enumerate(self.zeta):
self.zeta[j] = np.sum(np.exp(self.mean[:, j + 1] + self.variance[:, j + 1] / 2))
return self.zeta
def compute_post_variance(self, word, chain_variance):
"""Get the variance, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1)
<https://mimno.infosci.cornell.edu/info6150/readings/dynamic_topic_models.pdf>`_.
This function accepts the word to compute variance for, along with the associated sslm class object,
and returns the `variance` and the posterior approximation `fwd_variance`.
Notes
-----
This function essentially computes :math:`Var[\beta_{t,w}]` for t = 1:T.
.. math::
fwd\_variance[t] \equiv E((\beta_{t,w} - fwd\_mean[t])^2 \mid \beta_{1:t}) =
\frac{obs\_variance}{fwd\_variance[t - 1] + chain\_variance + obs\_variance}
\cdot (fwd\_variance[t - 1] + chain\_variance)
.. math::
variance[t] \equiv E((\beta_{t,w} - mean[t])^2 \mid \beta_{1:T}) =
\left( \frac{fwd\_variance[t]}{fwd\_variance[t] + chain\_variance} \right)^2 \cdot (variance[t + 1] - chain\_variance)
+ \left( 1 - \left( \frac{fwd\_variance[t]}{fwd\_variance[t] + chain\_variance} \right)^2 \right) \cdot fwd\_variance[t]
Parameters
----------
word: int
The word's ID.
chain_variance : float
Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time.
Returns
-------
(numpy.ndarray, numpy.ndarray)
The first returned value is the variance of each word in each time slice, the second value is the
inferred posterior variance for the same pairs.
"""
INIT_VARIANCE_CONST = 1000
T = self.num_time_slices
variance = self.variance[word]
fwd_variance = self.fwd_variance[word]
# forward pass. Set initial variance very high
fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST
for t in range(1, T + 1):
if self.obs_variance:
c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance)
else:
c = 0
fwd_variance[t] = c * (fwd_variance[t - 1] + chain_variance)
# backward pass
variance[T] = fwd_variance[T]
for t in range(T - 1, -1, -1):
if fwd_variance[t] > 0.0:
c = np.power((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2)
else:
c = 0
variance[t] = (c * (variance[t + 1] - chain_variance)) + ((1 - c) * fwd_variance[t])
return variance, fwd_variance
def compute_post_mean(self, word, chain_variance):
"""Get the mean, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1)
<https://mimno.infosci.cornell.edu/info6150/readings/dynamic_topic_models.pdf>`_.
Notes
-----
This function essentially computes :math:`E[\beta_{t,w}]` for t = 1:T.
.. math::
fwd\_mean[t] \equiv E(\beta_{t,w} \mid \hat{\beta}_{1:t}) =
\frac{obs\_variance}{fwd\_variance[t - 1] + chain\_variance + obs\_variance} \cdot fwd\_mean[t - 1]
+ \left( 1 - \frac{obs\_variance}{fwd\_variance[t - 1] + chain\_variance + obs\_variance} \right) \cdot \hat{\beta}_{t,w}
.. math::
mean[t] \equiv E(\beta_{t,w} \mid \hat{\beta}_{1:T}) =
\frac{chain\_variance}{fwd\_variance[t] + chain\_variance} \cdot fwd\_mean[t]
+ \left( 1 - \frac{chain\_variance}{fwd\_variance[t] + chain\_variance} \right) \cdot mean[t + 1]
Parameters
----------
word: int
The word's ID.
chain_variance : float
Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time.
Returns
-------
(numpy.ndarray, numpy.ndarray)
The first returned value is the mean of each word in each time slice, the second value is the
inferred posterior mean for the same pairs.
"""
T = self.num_time_slices
obs = self.obs[word]
fwd_variance = self.fwd_variance[word]
mean = self.mean[word]
fwd_mean = self.fwd_mean[word]
# forward
fwd_mean[0] = 0
for t in range(1, T + 1):
c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance)
fwd_mean[t] = c * fwd_mean[t - 1] + (1 - c) * obs[t - 1]
# backward pass
mean[T] = fwd_mean[T]
for t in range(T - 1, -1, -1):
if chain_variance == 0.0:
c = 0.0
else:
c = chain_variance / (fwd_variance[t] + chain_variance)
mean[t] = c * fwd_mean[t] + (1 - c) * mean[t + 1]
return mean, fwd_mean
def compute_expected_log_prob(self):
"""Compute the expected log probability given values of m.
The appendix of the DTM paper describes the expectation of the log probabilities in equation 5;
the implementation below is the result of solving that equation and follows the original
Blei DTM code.
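Concretely, the update implemented below is:
.. math::
e\_log\_prob[w][t] = mean[w][t + 1] - \log \zeta_t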
Returns
-------
numpy.ndarray of float
The expected value for the log probabilities for each word and time slice.
"""
for (w, t), val in np.ndenumerate(self.e_log_prob):
self.e_log_prob[w][t] = self.mean[w][t + 1] - np.log(self.zeta[t])
return self.e_log_prob
def sslm_counts_init(self, obs_variance, chain_variance, sstats):
"""Initialize the State Space Language Model with LDA sufficient statistics.
Called for each topic-chain and initializes initial mean, variance and Topic-Word probabilities
for the first time-slice.
Parameters
----------
obs_variance : float, optional
Observed variance used to approximate the true and forward variance.
chain_variance : float
Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time.
sstats : numpy.ndarray
Sufficient statistics of the LDA model. Corresponds to matrix beta in the linked paper for time slice 0,
expected shape (`self.vocab_len`, `num_topics`).
"""
W = self.vocab_len
T = self.num_time_slices
log_norm_counts = np.copy(sstats)
log_norm_counts = log_norm_counts / sum(log_norm_counts)
log_norm_counts = log_norm_counts + 1.0 / W
log_norm_counts = log_norm_counts / sum(log_norm_counts)
log_norm_counts = np.log(log_norm_counts)
# setting variational observations to transformed counts
self.obs = (np.repeat(log_norm_counts, T, axis=0)).reshape(W, T)
# set variational parameters
self.obs_variance = obs_variance
self.chain_variance = chain_variance
# compute post variance, mean
for w in range(0, W):
self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance)
self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, self.chain_variance)
self.zeta = self.update_zeta()
self.e_log_prob = self.compute_expected_log_prob()
def fit_sslm(self, sstats):
"""Fits variational distribution.
This is essentially the m-step.
Maximizes the approximation of the true posterior for a particular topic using the provided sufficient
statistics. Updates the values using :meth:`~gensim.models.ldaseqmodel.sslm.update_obs` and
:meth:`~gensim.models.ldaseqmodel.sslm.compute_expected_log_prob`.
Parameters
----------
sstats : numpy.ndarray
Sufficient statistics for a particular topic, accumulated over all time slices. Corresponds to matrix beta in
the linked paper, expected shape (`self.vocab_len`, `num_time_slices`).
Returns
-------
float
The lower bound for the true posterior achieved using the fitted approximate distribution.
"""
W = self.vocab_len
bound = 0
old_bound = 0
sslm_fit_threshold = 1e-6
sslm_max_iter = 2
converged = sslm_fit_threshold + 1
# computing variance, fwd_variance
self.variance, self.fwd_variance = \
(np.array(x) for x in list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)])))
# column sum of sstats
totals = sstats.sum(axis=0)
iter_ = 0
model = "DTM"
if model == "DTM":
bound = self.compute_bound(sstats, totals)
if model == "DIM":
bound = self.compute_bound_fixed(sstats, totals)
logger.info("initial sslm bound is %f", bound)
while converged > sslm_fit_threshold and iter_ < sslm_max_iter:
iter_ += 1
old_bound = bound
self.obs, self.zeta = self.update_obs(sstats, totals)
if model == "DTM":
bound = self.compute_bound(sstats, totals)
if model == "DIM":
bound = self.compute_bound_fixed(sstats, totals)
converged = np.fabs((bound - old_bound) / old_bound)
logger.info("iteration %i iteration lda seq bound is %f convergence is %f", iter_, bound, converged)
self.e_log_prob = self.compute_expected_log_prob()
return bound
def compute_bound(self, sstats, totals):
"""Compute the maximized lower bound achieved for the log probability of the true posterior.
Uses the formula presented in the appendix of the DTM paper (formula no. 5).
Parameters
----------
sstats : numpy.ndarray
Sufficient statistics for a particular topic, accumulated over all time slices. Corresponds to matrix beta in
the linked paper, expected shape (`self.vocab_len`, `num_time_slices`).
totals : list of int of length `len(self.time_slice)`
The totals for each time slice.
Returns
-------
float
The maximized lower bound.
"""
W = self.vocab_len
T = self.num_time_slices
term_1 = 0
term_2 = 0
term_3 = 0
val = 0
ent = 0
chain_variance = self.chain_variance
# computing mean, fwd_mean
self.mean, self.fwd_mean = \
(np.array(x) for x in zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, W)]))
self.zeta = self.update_zeta()
for w in range(0, W):
val += (self.variance[w][0] - self.variance[w][T]) / 2 * chain_variance
logger.info("Computing bound, all times")
for t in range(1, T + 1):
term_1 = 0.0
term_2 = 0.0
ent = 0.0
# fixed upper bound W so the inner loop always covers the full vocabulary
for w in range(0, W):
m = self.mean[w][t]
prev_m = self.mean[w][t - 1]
v = self.variance[w][t]
# w_phi_l is only used in Document Influence Model; the values are always zero in this case
# w_phi_l = sslm.w_phi_l[w][t - 1]
# exp_i = np.exp(-prev_m)
# term_1 += (np.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) -
# (v / chain_variance) - np.log(chain_variance)
term_1 += \
(np.power(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - np.log(chain_variance)
term_2 += sstats[w][t - 1] * m
ent += np.log(v) / 2 # note the 2pi's cancel with term1 (see doc)
term_3 = -totals[t - 1] * np.log(self.zeta[t - 1])
val += term_2 + term_3 + ent - term_1
return val
def update_obs(self, sstats, totals):
"""Optimize the bound with respect to the observed variables.
TODO:
This is by far the slowest function in the whole algorithm.
Replacing or improving the performance of this would greatly speed things up.
Parameters
----------
sstats : numpy.ndarray
Sufficient statistics for a particular topic, accumulated over all time slices. Corresponds to matrix beta in
the linked paper, expected shape (`self.vocab_len`, `num_time_slices`).
totals : list of int of length `len(self.time_slice)`
The totals for each time slice.
Returns
-------
(numpy.ndarray of float, numpy.ndarray of float)
The updated optimized values for obs and the zeta variational parameter.
"""
OBS_NORM_CUTOFF = 2
STEP_SIZE = 0.01
TOL = 1e-3
W = self.vocab_len
T = self.num_time_slices
runs = 0
mean_deriv_mtx = np.resize(np.zeros(T * (T + 1)), (T, T + 1))
norm_cutoff_obs = None
for w in range(0, W):
w_counts = sstats[w]
counts_norm = 0
# now we find L2 norm of w_counts
for i in range(0, len(w_counts)):
counts_norm += w_counts[i] * w_counts[i]
counts_norm = np.sqrt(counts_norm)
if counts_norm < OBS_NORM_CUTOFF and norm_cutoff_obs is not None:
obs = self.obs[w]
norm_cutoff_obs = np.copy(obs)
else:
if counts_norm < OBS_NORM_CUTOFF:
w_counts = np.zeros(len(w_counts))
# TODO: apply lambda function
for t in range(0, T):
mean_deriv = mean_deriv_mtx[t]
mean_deriv = self.compute_mean_deriv(w, t, mean_deriv)
mean_deriv_mtx[t] = mean_deriv
deriv = np.zeros(T)
args = self, w_counts, totals, mean_deriv_mtx, w, deriv
obs = self.obs[w]
model = "DTM"
if model == "DTM":
# slowest part of method
obs = optimize.fmin_cg(
f=f_obs, fprime=df_obs, x0=obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0
)
if model == "DIM":
pass
runs += 1
if counts_norm < OBS_NORM_CUTOFF:
norm_cutoff_obs = obs
self.obs[w] = obs
self.zeta = self.update_zeta()
return self.obs, self.zeta
def compute_mean_deriv(self, word, time, deriv):
"""Helper functions for optimizing a function.
Compute the derivative of:
.. :math::
E[\beta_{t,w}]/d obs_{s,w} for t = 1:T.
Parameters
----------
word : int
The word's ID.
time : int
The time slice.
deriv : list of float
Derivative for each time slice.
Returns
-------
list of float
Mean derivative for each time slice.
"""
T = self.num_time_slices
fwd_variance = self.variance[word]
deriv[0] = 0
# forward pass
for t in range(1, T + 1):
if self.obs_variance > 0.0:
w = self.obs_variance / (fwd_variance[t - 1] + self.chain_variance + self.obs_variance)
else:
w = 0.0
val = w * deriv[t - 1]
if time == t - 1:
val += (1 - w)
deriv[t] = val
for t in range(T - 1, -1, -1):
if self.chain_variance == 0.0:
w = 0.0
else:
w = self.chain_variance / (fwd_variance[t] + self.chain_variance)
deriv[t] = w * deriv[t] + (1 - w) * deriv[t + 1]
return deriv
def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv):
"""Derivation of obs which is used in derivative function `df_obs` while optimizing.
Parameters
----------
word : int
The word's ID.
word_counts : list of int
Total word counts for each time slice.
totals : list of int of length `len(self.time_slice)`
The totals for each time slice.
mean_deriv_mtx : numpy.ndarray
Matrix of mean derivatives, one row per time slice, expected shape (`num_time_slices`, `num_time_slices` + 1).
deriv : list of float
Output buffer for the derivative, one value per time slice.
Returns
-------
list of float
The derivative of the bound with respect to obs, one value per time slice.
"""
# flag
init_mult = 1000
T = self.num_time_slices
mean = self.mean[word]
variance = self.variance[word]
# only used for DIM mode
# w_phi_l = self.w_phi_l[word]
# m_update_coeff = self.m_update_coeff[word]
# temp_vector holds temporary zeta values
self.temp_vect = np.zeros(T)
for u in range(0, T):
self.temp_vect[u] = np.exp(mean[u + 1] + variance[u + 1] / 2)
for t in range(0, T):
mean_deriv = mean_deriv_mtx[t]
term1 = 0
term2 = 0
term3 = 0
term4 = 0
for u in range(1, T + 1):
mean_u = mean[u]
mean_u_prev = mean[u - 1]
dmean_u = mean_deriv[u]
dmean_u_prev = mean_deriv[u - 1]
term1 += (mean_u - mean_u_prev) * (dmean_u - dmean_u_prev)
term2 += (word_counts[u - 1] - (totals[u - 1] * self.temp_vect[u - 1] / self.zeta[u - 1])) * dmean_u
model = "DTM"
if model == "DIM":
# do some stuff
pass
if self.chain_variance:
term1 = - (term1 / self.chain_variance)
term1 = term1 - (mean[0] * mean_deriv[0]) / (init_mult * self.chain_variance)
else:
term1 = 0.0
deriv[t] = term1 + term2 + term3 + term4
return deriv
class LdaPost(utils.SaveLoad):
"""Posterior values associated with each set of documents.
TODO: use **Hoffman, Blei, Bach: Online Learning for Latent Dirichlet Allocation, NIPS 2010.**
to update phi, gamma. End game would be to somehow replace LdaPost entirely with LdaModel.
"""
def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma=None, lhood=None):
"""Initialize the posterior value structure for the given LDA model.
Parameters
----------
doc : list of (int, int)
A BOW representation of the document. Each element in the list is a pair of a word's ID and its number
of occurrences in the document.
lda : :class:`~gensim.models.ldamodel.LdaModel`, optional
The underlying LDA model.
max_doc_len : int, optional
The maximum number of words in a document.
num_topics : int, optional
Number of topics discovered by the LDA model.
gamma : numpy.ndarray, optional
Topic weight variational parameters for each document. If not supplied, it will be inferred from the model.
lhood : float, optional
The log likelihood lower bound.
"""
self.doc = doc
self.lda = lda
self.gamma = gamma
self.lhood = lhood
if self.gamma is None:
self.gamma = np.zeros(num_topics)
if self.lhood is None:
self.lhood = np.zeros(num_topics + 1)
if max_doc_len is not None and num_topics is not None:
self.phi = np.resize(np.zeros(max_doc_len * num_topics), (max_doc_len, num_topics))
self.log_phi = np.resize(np.zeros(max_doc_len * num_topics), (max_doc_len, num_topics))
# the following are class variables which are to be integrated during Document Influence Model
self.doc_weight = None
self.renormalized_doc_weight = None
def update_phi(self, doc_number, time):
"""Update variational multinomial parameters, based on a document and a time-slice.
This is done based on the original Blei LDA paper, where:
phi is proportional to beta * exp(Ψ(gamma)), normalized over the topics for every word.
TODO: incorporate lee-sueng trick used in
**Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**.
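The update implemented below, written out (phi is normalized over the topics for every word in the document):
.. math::
\phi_{w,k} \propto \exp\left( \Psi(\gamma_k) + E[\log \beta_{k,w}] \right)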
Parameters
----------
doc_number : int
Document number. Unused.
time : int
Time slice. Unused.
Returns
-------
(list of float, list of float)
Multinomial parameters, and their logarithm, for each word in the document.
"""
num_topics = self.lda.num_topics
# digamma values
dig = np.zeros(num_topics)
for k in range(0, num_topics):
dig[k] = digamma(self.gamma[k])
n = 0 # keep track of iterations for phi, log_phi
for word_id, count in self.doc:
for k in range(0, num_topics):
self.log_phi[n][k] = dig[k] + self.lda.topics[word_id][k]
log_phi_row = self.log_phi[n]
phi_row = self.phi[n]
# log normalize
v = log_phi_row[0]
for i in range(1, len(log_phi_row)):
v = np.logaddexp(v, log_phi_row[i])
# subtract every element by v
log_phi_row = log_phi_row - v
phi_row = np.exp(log_phi_row)
self.log_phi[n] = log_phi_row
self.phi[n] = phi_row
n += 1 # increase iteration
return self.phi, self.log_phi
def update_gamma(self):
"""Update variational dirichlet parameters.
This operation is described in the original Blei LDA paper:
gamma = alpha + sum(count * phi), summed over every word in the document.
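The update implemented below, with :math:`c_w` the count of word `w` in the document:
.. math::
\gamma_k = \alpha_k + \sum_{w \in doc} c_w \, \phi_{w,k}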
Returns
-------
list of float
The updated gamma parameters, one value per topic.
"""
self.gamma = np.copy(self.lda.alpha)
n = 0 # keep track of number of iterations for phi, log_phi
for word_id, count in self.doc:
phi_row = self.phi[n]
for k in range(0, self.lda.num_topics):
self.gamma[k] += phi_row[k] * count
n += 1
return self.gamma
def init_lda_post(self):
"""Initialize variational posterior. """
total = sum(count for word_id, count in self.doc)
self.gamma.fill(self.lda.alpha[0] + float(total) / self.lda.num_topics)
self.phi[:len(self.doc), :] = 1.0 / self.lda.num_topics
# doc_weight used during DIM
# ldapost.doc_weight = None
def compute_lda_lhood(self):
"""Compute the log likelihood bound.
Returns
-------
float
The optimal lower bound for the true posterior using the approximate distribution.
"""
num_topics = self.lda.num_topics
gamma_sum = np.sum(self.gamma)
# to be used in DIM
# sigma_l = 0
# sigma_d = 0
lhood = gammaln(np.sum(self.lda.alpha)) - gammaln(gamma_sum)
self.lhood[num_topics] = lhood
# influence_term = 0
digsum = digamma(gamma_sum)
model = "DTM" # noqa:F841
for k in range(0, num_topics):
# below code only to be used in DIM mode
# if ldapost.doc_weight is not None and (model == "DIM" or model == "fixed"):
# influence_topic = ldapost.doc_weight[k]
# influence_term = \
# - ((influence_topic * influence_topic + sigma_l * sigma_l) / 2.0 / (sigma_d * sigma_d))
e_log_theta_k = digamma(self.gamma[k]) - digsum
lhood_term = \
(self.lda.alpha[k] - self.gamma[k]) * e_log_theta_k + \
gammaln(self.gamma[k]) - gammaln(self.lda.alpha[k])
# TODO: check why there's an IF
n = 0
for word_id, count in self.doc:
if self.phi[n][k] > 0:
lhood_term += \
count * self.phi[n][k] * (e_log_theta_k + self.lda.topics[word_id][k] - self.log_phi[n][k])
n += 1
self.lhood[k] = lhood_term
lhood += lhood_term
# in case of DIM add influence term
# lhood += influence_term
return lhood
def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8,
lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None):
"""Posterior inference for lda.
Parameters
----------
doc_number : int
The document number.
time : int
Time slice.
ldaseq : object
Unused.
LDA_INFERENCE_CONVERGED : float
Epsilon value used to check whether the inference step has sufficiently converged.
lda_inference_max_iter : int
Maximum number of iterations in the inference step.
g : object
Unused. Will be useful when the DIM model is implemented.
g3_matrix: object
Unused. Will be useful when the DIM model is implemented.
g4_matrix: object
Unused. Will be useful when the DIM model is implemented.
g5_matrix: object
Unused. Will be useful when the DIM model is implemented.
Returns
-------
float
The optimal lower bound for the true posterior using the approximate distribution.
"""
self.init_lda_post()
# sum of counts in a doc
total = sum(count for word_id, count in self.doc)
model = "DTM"
if model == "DIM":
# if in DIM then we initialise some variables here
pass
lhood = self.compute_lda_lhood()
lhood_old = 0
converged = 0
iter_ = 0
# first iteration starts here
iter_ += 1
lhood_old = lhood
self.gamma = self.update_gamma()
model = "DTM"
if model == "DTM" or sslm is None:
self.phi, self.log_phi = self.update_phi(doc_number, time)
elif model == "DIM" and sslm is not None:
self.phi, self.log_phi = self.update_phi_fixed(doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix)
lhood = self.compute_lda_lhood()
converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
while converged > LDA_INFERENCE_CONVERGED and iter_ <= lda_inference_max_iter:
iter_ += 1
lhood_old = lhood
self.gamma = self.update_gamma()
model = "DTM"
if model == "DTM" or sslm is None:
self.phi, self.log_phi = self.update_phi(doc_number, time)
elif model == "DIM" and sslm is not None:
self.phi, self.log_phi = self.update_phi_fixed(doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix)
lhood = self.compute_lda_lhood()
converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
return lhood
def update_lda_seq_ss(self, time, doc, topic_suffstats):
"""Update lda sequence sufficient statistics from an lda posterior.
This is very similar to the :meth:`~gensim.models.ldaseqmodel.LdaPost.update_gamma` method and uses
the same formula.
Parameters
----------
time : int
The time slice.
doc : list of (int, float)
Unused but kept here for backwards compatibility. The document set in the constructor (`self.doc`) is used
instead.
topic_suffstats : list of float
Sufficient statistics for each topic.
Returns
-------
list of float
The updated sufficient statistics for each topic.
"""
num_topics = self.lda.num_topics
for k in range(0, num_topics):
topic_ss = topic_suffstats[k]
n = 0
for word_id, count in self.doc:
topic_ss[word_id][time] += count * self.phi[n][k]
n += 1
topic_suffstats[k] = topic_ss
return topic_suffstats
# the following functions are used in update_obs as the objective function.
def f_obs(x, *args):
"""Function which we are optimising for minimizing obs.
Parameters
----------
x : list of float
The obs values for this word.
sslm : :class:`~gensim.models.ldaseqmodel.sslm`
The State Space Language Model for DTM.
word_counts : list of int
Total word counts for each time slice.
totals : list of int of length `len(self.time_slice)`
The totals for each time slice.
mean_deriv_mtx : list of float
Mean derivative for each time slice.
word : int
The word's ID.
deriv : list of float
Mean derivative for each time slice.
Returns
-------
float
The value of the objective function evaluated at point `x`.
"""
sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args
# flag
init_mult = 1000
T = len(x)
val = 0
term1 = 0
term2 = 0
# term 3 and 4 for DIM
term3 = 0
term4 = 0
sslm.obs[word] = x
sslm.mean[word], sslm.fwd_mean[word] = sslm.compute_post_mean(word, sslm.chain_variance)
mean = sslm.mean[word]
variance = sslm.variance[word]
# only used for DIM mode
# w_phi_l = sslm.w_phi_l[word]
# m_update_coeff = sslm.m_update_coeff[word]
for t in range(1, T + 1):
mean_t = mean[t]
mean_t_prev = mean[t - 1]
val = mean_t - mean_t_prev
term1 += val * val
term2 += word_counts[t - 1] * mean_t - totals[t - 1] * np.exp(mean_t + variance[t] / 2) / sslm.zeta[t - 1]
model = "DTM"
if model == "DIM":
# stuff happens
pass
if sslm.chain_variance > 0.0:
term1 = - (term1 / (2 * sslm.chain_variance))
term1 = term1 - mean[0] * mean[0] / (2 * init_mult * sslm.chain_variance)
else:
term1 = 0.0
final = -(term1 + term2 + term3 + term4)
return final
def df_obs(x, *args):
"""Derivative of the objective function which optimises obs.
Parameters
----------
x : list of float
The obs values for this word.
sslm : :class:`~gensim.models.ldaseqmodel.sslm`
The State Space Language Model for DTM.
word_counts : list of int
Total word counts for each time slice.
totals : list of int of length `len(self.time_slice)`
The totals for each time slice.
mean_deriv_mtx : list of float
Mean derivative for each time slice.
word : int
The word's ID.
deriv : list of float
Mean derivative for each time slice.
Returns
-------
list of float
The derivative of the objective function evaluated at point `x`.
"""
sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args
sslm.obs[word] = x
sslm.mean[word], sslm.fwd_mean[word] = sslm.compute_post_mean(word, sslm.chain_variance)
model = "DTM"
if model == "DTM":
deriv = sslm.compute_obs_deriv(word, word_counts, totals, mean_deriv_mtx, deriv)
elif model == "DIM":
deriv = sslm.compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, deriv) # noqa:F821
return np.negative(deriv)