#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
# Based on Copyright (C) 2016 Radim Rehurek <radimrehurek@seznam.cz>

"""Lda Sequence model, inspired by `David M. Blei, John D. Lafferty: "Dynamic Topic Models"
<https://mimno.infosci.cornell.edu/info6150/readings/dynamic_topic_models.pdf>`_.
The original C/C++ implementation can be found on `blei-lab/dtm <https://github.com/blei-lab/dtm>`_.


TODO: The next steps to take this forward would be:

#. Include DIM mode. Most of the infrastructure for this is in place.
#. See if LdaPost can be replaced by LdaModel completely without breaking anything.
#. Heavy lifting going on in the Sslm class - efforts can be made to cythonise mathematical methods, in particular
   `update_obs` and the optimization, which take a lot of time.
#. Try and make it distributed, especially around the E and M step.
#. Remove all C/C++ coding style/syntax.

Examples
--------

Set up a model with 9 documents: 2 in the first time-slice, 4 in the second, and 3 in the third

>>> from gensim.test.utils import common_corpus
>>> from gensim.models import LdaSeqModel
>>>
>>> ldaseq = LdaSeqModel(corpus=common_corpus, time_slice=[2, 4, 3], num_topics=2, chunksize=1)

Persist a model to disk and reload it later

>>> from gensim.test.utils import datapath
>>>
>>> temp_file = datapath("model")
>>> ldaseq.save(temp_file)
>>>
>>> # Load a potentially pre-trained model from disk.
>>> ldaseq = LdaSeqModel.load(temp_file)

Access the document embeddings generated from the DTM

>>> doc = common_corpus[1]
>>>
>>> embedding = ldaseq[doc]
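
Inspect how a topic's most relevant words evolve over the time slices (an illustrative sketch -- the actual
words depend on the training run, so only the structure of the result is shown)

>>> evolution = ldaseq.print_topic_times(topic=0, top_terms=5)
>>> len(evolution)  # one list of (word, probability) pairs per time slice
3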

"""
|
||
|
||
from gensim import utils, matutils
|
||
from gensim.models import ldamodel
|
||
import numpy as np
|
||
from scipy.special import digamma, gammaln
|
||
from scipy import optimize
|
||
import logging
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class LdaSeqModel(utils.SaveLoad):
|
||
"""Estimate Dynamic Topic Model parameters based on a training corpus."""
|
||
def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10,
|
||
initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10,
|
||
random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
|
||
"""
|
||
|
||
Parameters
|
||
----------
|
||
corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
|
||
Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).
|
||
            If not given, the model is left untrained (presumably because you want to call
            :meth:`~gensim.models.ldaseqmodel.LdaSeqModel.fit_lda_seq` manually later).
|
||
time_slice : list of int, optional
|
||
Number of documents in each time-slice. Each time slice could for example represent a year's published
|
||
papers, in case the corpus comes from a journal publishing over multiple years.
|
||
            It is assumed that `sum(time_slice) == num_documents`, i.e. the time slices jointly cover the whole corpus.
|
||
id2word : dict of (int, str), optional
|
||
Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for
|
||
debugging and topic printing.
|
||
alphas : float, optional
|
||
            The prior probability of each topic, used to build a symmetric Dirichlet prior over the document-topic
            distribution.
|
||
num_topics : int, optional
|
||
The number of requested latent topics to be extracted from the training corpus.
|
||
initialize : {'gensim', 'own', 'ldamodel'}, optional
|
||
Controls the initialization of the DTM model. Supports three different modes:
|
||
* 'gensim': Uses gensim's LDA initialization.
|
||
* 'own': Uses your own initialization matrix of an LDA model that has been previously trained.
|
||
                * 'ldamodel': Use a previously trained LDA model, passing it through the `lda_model` argument.
|
||
sstats : numpy.ndarray , optional
|
||
Sufficient statistics used for initializing the model if `initialize == 'own'`. Corresponds to matrix
|
||
beta in the linked paper for time slice 0, expected shape (`self.vocab_len`, `num_topics`).
|
||
lda_model : :class:`~gensim.models.ldamodel.LdaModel`
|
||
            Model whose sufficient statistics will be used to initialize the current object if `initialize == 'ldamodel'`.
|
||
obs_variance : float, optional
|
||
Observed variance used to approximate the true and forward variance as shown in
|
||
`David M. Blei, John D. Lafferty: "Dynamic Topic Models"
|
||
<https://mimno.infosci.cornell.edu/info6150/readings/dynamic_topic_models.pdf>`_.
|
||
chain_variance : float, optional
|
||
Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time.
|
||
passes : int, optional
|
||
Number of passes over the corpus for the initial :class:`~gensim.models.ldamodel.LdaModel`
|
||
random_state : {numpy.random.RandomState, int}, optional
|
||
Can be a np.random.RandomState object, or the seed to generate one. Used for reproducibility of results.
|
||
lda_inference_max_iter : int, optional
|
||
Maximum number of iterations in the inference step of the LDA training.
|
||
em_min_iter : int, optional
|
||
            Minimum number of iterations until convergence of the Expectation-Maximization algorithm.
        em_max_iter : int, optional
            Maximum number of iterations until convergence of the Expectation-Maximization algorithm.
        chunksize : int, optional
            Number of documents in the corpus to be processed in a chunk.
|
||
|
||
"""
|
||
self.id2word = id2word
|
||
if corpus is None and self.id2word is None:
|
||
raise ValueError(
|
||
'at least one of corpus/id2word must be specified, to establish input space dimensionality'
|
||
)
|
||
|
||
if self.id2word is None:
|
||
logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
|
||
self.id2word = utils.dict_from_corpus(corpus)
|
||
self.vocab_len = len(self.id2word)
|
||
elif len(self.id2word) > 0:
|
||
self.vocab_len = len(self.id2word)
|
||
else:
|
||
self.vocab_len = 0
|
||
|
||
if corpus is not None:
|
||
try:
|
||
self.corpus_len = len(corpus)
|
||
except TypeError:
|
||
logger.warning("input corpus stream has no len(); counting documents")
|
||
self.corpus_len = sum(1 for _ in corpus)
|
||
|
||
self.time_slice = time_slice
|
||
if self.time_slice is not None:
|
||
self.num_time_slices = len(time_slice)
|
||
|
||
max_doc_len = 0
|
||
for line_no, line in enumerate(corpus):
|
||
if len(line) > max_doc_len:
|
||
max_doc_len = len(line)
|
||
self.max_doc_len = max_doc_len
|
||
|
||
self.num_topics = num_topics
|
||
self.num_time_slices = len(time_slice)
|
||
self.alphas = np.full(num_topics, alphas)
|
||
|
||
# topic_chains contains for each topic a 'state space language model' object
|
||
# which in turn has information about each topic
|
||
# the sslm class is described below and contains information
|
||
# on topic-word probabilities and doc-topic probabilities.
|
||
self.topic_chains = []
|
||
for topic in range(0, num_topics):
|
||
sslm_ = sslm(
|
||
num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics,
|
||
chain_variance=chain_variance, obs_variance=obs_variance
|
||
)
|
||
self.topic_chains.append(sslm_)
|
||
|
||
# the following are class variables which are to be integrated during Document Influence Model
|
||
self.top_doc_phis = None
|
||
self.influence = None
|
||
self.renormalized_influence = None
|
||
self.influence_sum_lgl = None
|
||
|
||
# if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM.
|
||
if corpus is not None and time_slice is not None:
|
||
if initialize == 'gensim':
|
||
lda_model = ldamodel.LdaModel(
|
||
corpus, id2word=self.id2word, num_topics=self.num_topics,
|
||
passes=passes, alpha=self.alphas, random_state=random_state,
|
||
dtype=np.float64
|
||
)
|
||
self.sstats = np.transpose(lda_model.state.sstats)
|
||
if initialize == 'ldamodel':
|
||
self.sstats = np.transpose(lda_model.state.sstats)
|
||
if initialize == 'own':
|
||
self.sstats = sstats
|
||
|
||
# initialize model from sstats
|
||
self.init_ldaseq_ss(chain_variance, obs_variance, self.alphas, self.sstats)
|
||
|
||
# fit DTM
|
||
self.fit_lda_seq(corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize)
|
||
|
||
def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_suffstats):
|
||
"""Initialize State Space Language Model, topic-wise.
|
||
|
||
Parameters
|
||
----------
|
||
topic_chain_variance : float
|
||
Gaussian parameter defined in the beta distribution to dictate how the beta values evolve.
|
||
topic_obs_variance : float
|
||
Observed variance used to approximate the true and forward variance as shown in
|
||
`David M. Blei, John D. Lafferty: "Dynamic Topic Models"
|
||
<https://mimno.infosci.cornell.edu/info6150/readings/dynamic_topic_models.pdf>`_.
|
||
alpha : float
|
||
The prior probability for the model.
|
||
init_suffstats : numpy.ndarray
|
||
Sufficient statistics used for initializing the model, expected shape (`self.vocab_len`, `num_topics`).
|
||
|
||
"""
|
||
self.alphas = alpha
|
||
for k, chain in enumerate(self.topic_chains):
|
||
sstats = init_suffstats[:, k]
|
||
sslm.sslm_counts_init(chain, topic_obs_variance, topic_chain_variance, sstats)
|
||
|
||
# initialize the below matrices only if running DIM
|
||
# ldaseq.topic_chains[k].w_phi_l = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices))
|
||
# ldaseq.topic_chains[k].w_phi_sum = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices))
|
||
# ldaseq.topic_chains[k].w_phi_sq = np.zeros((ldaseq.vocab_len, ldaseq.num_time_slices))
|
||
|
||
def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize):
|
||
"""Fit a LDA Sequence model (DTM).
|
||
|
||
        This method will iteratively set up LDA models and perform EM steps until the sufficient statistics converge,
        or until the maximum number of iterations is reached. Because the true posterior is intractable, an
        appropriately tight lower bound must be used instead. This function optimizes that bound, which amounts to
        minimizing the Kullback-Leibler divergence between the variational approximation and the true posterior.
|
||
|
||
Parameters
|
||
----------
|
||
corpus : {iterable of list of (int, float), scipy.sparse.csc}
|
||
Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).
|
||
lda_inference_max_iter : int
|
||
Maximum number of iterations for the inference step of LDA.
|
||
em_min_iter : int
|
||
            Minimum number of iterations of the Expectation-Maximization algorithm.
        em_max_iter : int
            Maximum number of iterations of the Expectation-Maximization algorithm.
|
||
chunksize : int
|
||
Number of documents to be processed in each chunk.
|
||
|
||
Returns
|
||
-------
|
||
float
|
||
The highest lower bound for the true posterior produced after all iterations.
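
        Notes
        -----
        Convergence of the EM loop is assessed on the relative change of the bound between consecutive iterations
        (a sketch of the criterion used below; the threshold is hard-coded to 1e-4):

        .. math::

            convergence = | (bound - bound_{old}) / bound_{old} | < 10^{-4}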
|
||
|
||
"""
|
||
LDASQE_EM_THRESHOLD = 1e-4
|
||
# if bound is low, then we increase iterations.
|
||
LOWER_ITER = 10
|
||
ITER_MULT_LOW = 2
|
||
MAX_ITER = 500
|
||
|
||
num_topics = self.num_topics
|
||
vocab_len = self.vocab_len
|
||
data_len = self.num_time_slices
|
||
corpus_len = self.corpus_len
|
||
|
||
bound = 0
|
||
convergence = LDASQE_EM_THRESHOLD + 1
|
||
iter_ = 0
|
||
|
||
while iter_ < em_min_iter or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= em_max_iter):
|
||
|
||
logger.info(" EM iter %i", iter_)
|
||
logger.info("E Step")
|
||
# TODO: bound is initialized to 0
|
||
old_bound = bound
|
||
|
||
# initiate sufficient statistics
|
||
topic_suffstats = []
|
||
for topic in range(0, num_topics):
|
||
topic_suffstats.append(np.resize(np.zeros(vocab_len * data_len), (vocab_len, data_len)))
|
||
|
||
# set up variables
|
||
gammas = np.resize(np.zeros(corpus_len * num_topics), (corpus_len, num_topics))
|
||
lhoods = np.resize(np.zeros(corpus_len * num_topics + 1), (corpus_len, num_topics + 1))
|
||
# compute the likelihood of a sequential corpus under an LDA
|
||
# seq model and find the evidence lower bound. This is the E - Step
|
||
bound, gammas = \
|
||
self.lda_seq_infer(corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter, chunksize)
|
||
self.gammas = gammas
|
||
|
||
logger.info("M Step")
|
||
|
||
# fit the variational distribution. This is the M - Step
|
||
topic_bound = self.fit_lda_seq_topics(topic_suffstats)
|
||
bound += topic_bound
|
||
|
||
if (bound - old_bound) < 0:
|
||
# if max_iter is too low, increase iterations.
|
||
if lda_inference_max_iter < LOWER_ITER:
|
||
lda_inference_max_iter *= ITER_MULT_LOW
|
||
logger.info("Bound went down, increasing iterations to %i", lda_inference_max_iter)
|
||
|
||
# check for convergence
|
||
convergence = np.fabs((bound - old_bound) / old_bound)
|
||
|
||
if convergence < LDASQE_EM_THRESHOLD:
|
||
|
||
lda_inference_max_iter = MAX_ITER
|
||
logger.info("Starting final iterations, max iter is %i", lda_inference_max_iter)
|
||
convergence = 1.0
|
||
|
||
logger.info("iteration %i iteration lda seq bound is %f convergence is %f", iter_, bound, convergence)
|
||
|
||
iter_ += 1
|
||
|
||
return bound
|
||
|
||
def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods,
|
||
iter_, lda_inference_max_iter, chunksize):
|
||
"""Inference (or E-step) for the lower bound EM optimization.
|
||
|
||
This is used to set up the gensim :class:`~gensim.models.ldamodel.LdaModel` to be used for each time-slice.
|
||
It also allows for Document Influence Model code to be written in.
|
||
|
||
Parameters
|
||
----------
|
||
corpus : {iterable of list of (int, float), scipy.sparse.csc}
|
||
Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).
|
||
topic_suffstats : numpy.ndarray
|
||
            Sufficient statistics to be accumulated during inference, one matrix per topic with expected shape
            (`self.vocab_len`, `num_time_slices`).
|
||
gammas : numpy.ndarray
|
||
Topic weight variational parameters for each document. If not supplied, it will be inferred from the model.
|
||
lhoods : list of float
|
||
The total log probability lower bound for each topic. Corresponds to the phi variational parameters in the
|
||
linked paper.
|
||
iter_ : int
|
||
Current iteration.
|
||
lda_inference_max_iter : int
|
||
Maximum number of iterations for the inference step of LDA.
|
||
chunksize : int
|
||
Number of documents to be processed in each chunk.
|
||
|
||
Returns
|
||
-------
|
||
(float, list of float)
|
||
The first value is the highest lower bound for the true posterior.
|
||
The second value is the list of optimized dirichlet variational parameters for the approximation of
|
||
the posterior.
|
||
|
||
"""
|
||
num_topics = self.num_topics
|
||
vocab_len = self.vocab_len
|
||
bound = 0.0
|
||
|
||
lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
|
||
lda.topics = np.array(np.split(np.zeros(vocab_len * num_topics), vocab_len))
|
||
ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda)
|
||
|
||
model = "DTM"
|
||
if model == "DTM":
|
||
bound, gammas = self.inferDTMseq(
|
||
corpus, topic_suffstats, gammas, lhoods, lda,
|
||
ldapost, iter_, bound, lda_inference_max_iter, chunksize
|
||
)
|
||
elif model == "DIM":
|
||
self.InfluenceTotalFixed(corpus)
|
||
bound, gammas = self.inferDIMseq(
|
||
corpus, topic_suffstats, gammas, lhoods, lda,
|
||
ldapost, iter_, bound, lda_inference_max_iter, chunksize
|
||
)
|
||
|
||
return bound, gammas
|
||
|
||
def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda,
|
||
ldapost, iter_, bound, lda_inference_max_iter, chunksize):
|
||
"""Compute the likelihood of a sequential corpus under an LDA seq model, and reports the likelihood bound.
|
||
|
||
Parameters
|
||
----------
|
||
corpus : {iterable of list of (int, float), scipy.sparse.csc}
|
||
Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).
|
||
topic_suffstats : numpy.ndarray
|
||
Sufficient statistics of the current model, expected shape (`self.vocab_len`, `num_topics`).
|
||
gammas : numpy.ndarray
|
||
Topic weight variational parameters for each document. If not supplied, it will be inferred from the model.
|
||
lhoods : list of float of length `self.num_topics`
|
||
The total log probability bound for each topic. Corresponds to phi from the linked paper.
|
||
lda : :class:`~gensim.models.ldamodel.LdaModel`
|
||
The trained LDA model of the previous iteration.
|
||
ldapost : :class:`~gensim.models.ldaseqmodel.LdaPost`
|
||
Posterior probability variables for the given LDA model. This will be used as the true (but intractable)
|
||
posterior.
|
||
iter_ : int
|
||
The current iteration.
|
||
bound : float
|
||
The LDA bound produced after all iterations.
|
||
lda_inference_max_iter : int
|
||
Maximum number of iterations for the inference step of LDA.
|
||
chunksize : int
|
||
Number of documents to be processed in each chunk.
|
||
|
||
Returns
|
||
-------
|
||
(float, list of float)
|
||
The first value is the highest lower bound for the true posterior.
|
||
The second value is the list of optimized dirichlet variational parameters for the approximation of
|
||
the posterior.
|
||
|
||
"""
|
||
doc_index = 0 # overall doc_index in corpus
|
||
time = 0 # current time-slice
|
||
doc_num = 0 # doc-index in current time-slice
|
||
lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice
|
||
|
||
time_slice = np.cumsum(np.array(self.time_slice))
|
||
|
||
for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
|
||
# iterates chunk size for constant memory footprint
|
||
for doc in chunk:
|
||
# this is used to update the time_slice and create a new lda_seq slice every new time_slice
|
||
if doc_index > time_slice[time]:
|
||
time += 1
|
||
lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice
|
||
doc_num = 0
|
||
|
||
gam = gammas[doc_index]
|
||
lhood = lhoods[doc_index]
|
||
|
||
ldapost.gamma = gam
|
||
ldapost.lhood = lhood
|
||
ldapost.doc = doc
|
||
|
||
# TODO: replace fit_lda_post with appropriate ldamodel functions, if possible.
|
||
if iter_ == 0:
|
||
doc_lhood = LdaPost.fit_lda_post(
|
||
ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter
|
||
)
|
||
else:
|
||
doc_lhood = LdaPost.fit_lda_post(
|
||
ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter
|
||
)
|
||
|
||
if topic_suffstats is not None:
|
||
topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, doc, topic_suffstats)
|
||
|
||
gammas[doc_index] = ldapost.gamma
|
||
bound += doc_lhood
|
||
doc_index += 1
|
||
doc_num += 1
|
||
|
||
return bound, gammas
|
||
|
||
def make_lda_seq_slice(self, lda, time):
|
||
"""Update the LDA model topic-word values using time slices.
|
||
|
||
Parameters
|
||
----------
|
||
|
||
lda : :class:`~gensim.models.ldamodel.LdaModel`
|
||
The stationary model to be updated
|
||
time : int
|
||
The time slice assigned to the stationary model.
|
||
|
||
Returns
|
||
-------
|
||
lda : :class:`~gensim.models.ldamodel.LdaModel`
|
||
The stationary model updated to reflect the passed time slice.
|
||
|
||
"""
|
||
for k in range(0, self.num_topics):
|
||
lda.topics[:, k] = np.copy(self.topic_chains[k].e_log_prob[:, time])
|
||
|
||
lda.alpha = np.copy(self.alphas)
|
||
return lda
|
||
|
||
def fit_lda_seq_topics(self, topic_suffstats):
|
||
"""Fit the sequential model topic-wise.
|
||
|
||
Parameters
|
||
----------
|
||
topic_suffstats : numpy.ndarray
|
||
Sufficient statistics of the current model, expected shape (`self.vocab_len`, `num_topics`).
|
||
|
||
Returns
|
||
-------
|
||
float
|
||
The sum of the optimized lower bounds for all topics.
|
||
|
||
"""
|
||
lhood = 0
|
||
|
||
for k, chain in enumerate(self.topic_chains):
|
||
logger.info("Fitting topic number %i", k)
|
||
lhood_term = sslm.fit_sslm(chain, topic_suffstats[k])
|
||
lhood += lhood_term
|
||
|
||
return lhood
|
||
|
||
def print_topic_times(self, topic, top_terms=20):
|
||
"""Get the most relevant words for a topic, for each timeslice. This can be used to inspect the evolution of a
|
||
topic through time.
|
||
|
||
Parameters
|
||
----------
|
||
topic : int
|
||
The index of the topic.
|
||
top_terms : int, optional
|
||
Number of most relevant words associated with the topic to be returned.
|
||
|
||
Returns
|
||
-------
|
||
list of list of str
|
||
Top `top_terms` relevant terms for the topic for each time slice.
|
||
|
||
"""
|
||
topics = []
|
||
for time in range(0, self.num_time_slices):
|
||
topics.append(self.print_topic(topic, time, top_terms))
|
||
|
||
return topics
|
||
|
||
def print_topics(self, time=0, top_terms=20):
|
||
"""Get the most relevant words for every topic.
|
||
|
||
Parameters
|
||
----------
|
||
time : int, optional
|
||
            The time slice we are interested in (since topics evolve over time, it is expected that the most
            relevant words will also gradually change).
|
||
top_terms : int, optional
|
||
Number of most relevant words to be returned for each topic.
|
||
|
||
Returns
|
||
-------
|
||
list of list of (str, float)
|
||
Representation of all topics. Each of them is represented by a list of pairs of words and their assigned
|
||
probability.
|
||
|
||
"""
|
||
return [self.print_topic(topic, time, top_terms) for topic in range(0, self.num_topics)]
|
||
|
||
def print_topic(self, topic, time=0, top_terms=20):
|
||
"""Get the list of words most relevant to the given topic.
|
||
|
||
Parameters
|
||
----------
|
||
topic : int
|
||
The index of the topic to be inspected.
|
||
time : int, optional
|
||
            The time slice we are interested in (since topics evolve over time, it is expected that the most
            relevant words will also gradually change).
|
||
top_terms : int, optional
|
||
Number of words associated with the topic to be returned.
|
||
|
||
Returns
|
||
-------
|
||
list of (str, float)
|
||
The representation of this topic. Each element in the list includes the word itself, along with the
|
||
probability assigned to it by the topic.
|
||
|
||
"""
|
||
topic = self.topic_chains[topic].e_log_prob
|
||
topic = np.transpose(topic)
|
||
topic = np.exp(topic[time])
|
||
topic = topic / topic.sum()
|
||
bestn = matutils.argsort(topic, top_terms, reverse=True)
|
||
beststr = [(self.id2word[id_], topic[id_]) for id_ in bestn]
|
||
return beststr
|
||
|
||
def doc_topics(self, doc_number):
|
||
"""Get the topic mixture for a document.
|
||
|
||
Uses the priors for the dirichlet distribution that approximates the true posterior with the optimal
|
||
lower bound, and therefore requires the model to be already trained.
|
||
|
||
|
||
Parameters
|
||
----------
|
||
doc_number : int
|
||
Index of the document for which the mixture is returned.
|
||
|
||
Returns
|
||
-------
|
||
list of length `self.num_topics`
|
||
            Probability for each topic in the mixture (essentially a point in the `self.num_topics - 1` simplex).
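
        Examples
        --------
        A minimal sketch (assumes a trained model `ldaseq`; the actual probabilities depend on the training run):

        >>> mixture = ldaseq.doc_topics(doc_number=1)
        >>> len(mixture) == ldaseq.num_topics
        True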
|
||
|
||
"""
|
||
doc_topic = np.copy(self.gammas)
|
||
doc_topic /= doc_topic.sum(axis=1)[:, np.newaxis]
|
||
return doc_topic[doc_number]
|
||
|
||
def dtm_vis(self, time, corpus):
|
||
"""Get the information needed to visualize the corpus model at a given time slice, using the pyLDAvis format.
|
||
|
||
Parameters
|
||
----------
|
||
time : int
|
||
The time slice we are interested in.
|
||
corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
|
||
The corpus we want to visualize at the given time slice.
|
||
|
||
Returns
|
||
-------
|
||
doc_topics : list of length `self.num_topics`
|
||
            Probability for each topic in the mixture (essentially a point in the `self.num_topics - 1` simplex).
|
||
topic_term : numpy.ndarray
|
||
The representation of each topic as a multinomial over words in the vocabulary,
|
||
expected shape (`num_topics`, vocabulary length).
|
||
doc_lengths : list of int
|
||
The number of words in each document. These could be fixed, or drawn from a Poisson distribution.
|
||
term_frequency : numpy.ndarray
|
||
The term frequency matrix (denoted as beta in the original Blei paper). This could also be the TF-IDF
|
||
representation of the corpus, expected shape (number of documents, length of vocabulary).
|
||
vocab : list of str
|
||
            The set of unique terms existing in the corpus's vocabulary.
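
        Examples
        --------
        The returned values map directly onto the inputs of `pyLDAvis.prepare` (a sketch only -- it assumes the
        optional `pyLDAvis` package is installed and that `corpus` is the corpus the model was trained on):

        >>> doc_topic, topic_term, doc_lengths, term_frequency, vocab = ldaseq.dtm_vis(time=0, corpus=corpus)
        >>> import pyLDAvis
        >>> vis = pyLDAvis.prepare(
        ...     topic_term_dists=topic_term, doc_topic_dists=doc_topic, doc_lengths=doc_lengths,
        ...     vocab=vocab, term_frequency=term_frequency
        ... )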
|
||
|
||
"""
|
||
doc_topic = np.copy(self.gammas)
|
||
doc_topic /= doc_topic.sum(axis=1)[:, np.newaxis]
|
||
|
||
topic_term = [
|
||
np.exp(np.transpose(chain.e_log_prob)[time]) / np.exp(np.transpose(chain.e_log_prob)[time]).sum()
|
||
for k, chain in enumerate(self.topic_chains)
|
||
]
|
||
|
||
doc_lengths = [len(doc) for doc_no, doc in enumerate(corpus)]
|
||
|
||
term_frequency = np.zeros(self.vocab_len)
|
||
for doc_no, doc in enumerate(corpus):
|
||
for pair in doc:
|
||
term_frequency[pair[0]] += pair[1]
|
||
|
||
vocab = [self.id2word[i] for i in range(0, len(self.id2word))]
|
||
|
||
return doc_topic, np.array(topic_term), doc_lengths, term_frequency, vocab
|
||
|
||
def dtm_coherence(self, time):
|
||
"""Get the coherence for each topic.
|
||
|
||
Can be used to measure the quality of the model, or to inspect the convergence through training via a callback.
|
||
|
||
Parameters
|
||
----------
|
||
time : int
|
||
The time slice.
|
||
|
||
Returns
|
||
-------
|
||
list of list of str
|
||
            The word representation for each topic, for each time slice. This can be used to check the time coherence
            of topics as time evolves: if the most relevant words remain the same, the topic has somehow converged
            or is relatively static; if they change rapidly, the topic is still evolving.
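
        Examples
        --------
        The per-topic word lists can be scored with :class:`~gensim.models.coherencemodel.CoherenceModel`
        (a sketch -- `corpus` and `dictionary` stand for the training corpus and its dictionary, which are not
        stored on the model):

        >>> from gensim.models import CoherenceModel
        >>> cm = CoherenceModel(topics=ldaseq.dtm_coherence(time=0), corpus=corpus,
        ...                     dictionary=dictionary, coherence='u_mass')
        >>> coherence = cm.get_coherence()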
|
||
|
||
"""
|
||
coherence_topics = []
|
||
for topics in self.print_topics(time):
|
||
coherence_topic = []
|
||
for word, dist in topics:
|
||
coherence_topic.append(word)
|
||
coherence_topics.append(coherence_topic)
|
||
|
||
return coherence_topics
|
||
|
||
def __getitem__(self, doc):
|
||
"""Get the topic mixture for the given document, using the inferred approximation of the true posterior.
|
||
|
||
Parameters
|
||
----------
|
||
doc : list of (int, float)
|
||
The doc in BOW format. Can be an unseen document.
|
||
|
||
Returns
|
||
-------
|
||
list of float
|
||
Probabilities for each topic in the mixture. This is essentially a point in the `num_topics - 1` simplex.
|
||
|
||
"""
|
||
lda_model = \
|
||
ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
|
||
lda_model.topics = np.array(np.split(np.zeros(self.vocab_len * self.num_topics), self.vocab_len))
|
||
ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc)
|
||
|
||
time_lhoods = []
|
||
for time in range(0, self.num_time_slices):
|
||
lda_model = self.make_lda_seq_slice(lda_model, time) # create lda_seq slice
|
||
lhood = LdaPost.fit_lda_post(ldapost, 0, time, self)
|
||
time_lhoods.append(lhood)
|
||
|
||
doc_topic = ldapost.gamma / ldapost.gamma.sum()
|
||
# should even the likelihoods be returned?
|
||
return doc_topic
|
||
|
||
|
||
class sslm(utils.SaveLoad):
|
||
"""Encapsulate the inner State Space Language Model for DTM.
|
||
|
||
Some important attributes of this class:
|
||
|
||
* `obs` is a matrix containing the document to topic ratios.
|
||
* `e_log_prob` is a matrix containing the topic to word ratios.
|
||
* `mean` contains the mean values to be used for inference for each word for a time slice.
|
||
* `variance` contains the variance values to be used for inference of word in a time slice.
|
||
        * `fwd_mean` and `fwd_variance` are the forward posterior values for the mean and the variance.
|
||
* `zeta` is an extra variational parameter with a value for each time slice.
|
||
|
||
"""
|
||
|
||
def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005):
|
||
self.vocab_len = vocab_len
|
||
self.num_time_slices = num_time_slices
|
||
self.obs_variance = obs_variance
|
||
self.chain_variance = chain_variance
|
||
self.num_topics = num_topics
|
||
|
||
# setting up matrices
|
||
self.obs = np.array(np.split(np.zeros(num_time_slices * vocab_len), vocab_len))
|
||
self.e_log_prob = np.array(np.split(np.zeros(num_time_slices * vocab_len), vocab_len))
|
||
self.mean = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
|
||
self.fwd_mean = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
|
||
self.fwd_variance = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
|
||
self.variance = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
|
||
self.zeta = np.zeros(num_time_slices)
|
||
|
||
# the following are class variables which are to be integrated during Document Influence Model
|
||
self.m_update_coeff = None
|
||
self.mean_t = None
|
||
self.variance_t = None
|
||
self.influence_sum_lgl = None
|
||
self.w_phi_l = None
|
||
self.w_phi_sum = None
|
||
self.w_phi_l_sq = None
|
||
self.m_update_coeff_g = None
|
||
|
||
def update_zeta(self):
|
||
"""Update the Zeta variational parameter.
|
||
|
||
Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)),
|
||
over every time-slice. It is the value of variational parameter zeta which maximizes the lower bound.
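
        In symbols, for each time slice t (a restatement of the update implemented below):

        .. math::

            \zeta_t = \sum_{w} \exp(mean_{w, t} + variance_{w, t} / 2)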
|
||
|
||
Returns
|
||
-------
|
||
list of float
|
||
The updated zeta values for each time slice.
|
||
|
||
"""
|
||
for j, val in enumerate(self.zeta):
|
||
self.zeta[j] = np.sum(np.exp(self.mean[:, j + 1] + self.variance[:, j + 1] / 2))
|
||
return self.zeta
|
||
|
||
def compute_post_variance(self, word, chain_variance):
|
||
"""Get the variance, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1)
|
||
<https://mimno.infosci.cornell.edu/info6150/readings/dynamic_topic_models.pdf>`_.
|
||
|
||
This function accepts the word to compute variance for, along with the associated sslm class object,
|
||
and returns the `variance` and the posterior approximation `fwd_variance`.
|
||
|
||
Notes
|
||
-----
|
||
        This function essentially computes :math:`Var[\beta_{t,w}]` for t = 1:T.

        .. math::

            fwd\_variance[t] \equiv E((\beta_{t,w} - mean_{t,w})^2 | \beta_{1:t}) =
            (obs\_variance / (fwd\_variance[t - 1] + chain\_variance + obs\_variance)) *
            (fwd\_variance[t - 1] + chain\_variance)

        .. math::

            variance[t] \equiv E((\beta_{t,w} - mean_{t,w})^2 | \beta_{1:T}) =
            (fwd\_variance[t] / (fwd\_variance[t] + chain\_variance))^2 *
            (variance[t + 1] - chain\_variance) +
            (1 - (fwd\_variance[t] / (fwd\_variance[t] + chain\_variance))^2) * fwd\_variance[t]
|
||
|
||
Parameters
|
||
----------
|
||
word: int
|
||
The word's ID.
|
||
chain_variance : float
|
||
Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time.
|
||
|
||
Returns
|
||
-------
|
||
(numpy.ndarray, numpy.ndarray)
|
||
The first returned value is the variance of each word in each time slice, the second value is the
|
||
inferred posterior variance for the same pairs.
|
||
|
||
"""
|
||
INIT_VARIANCE_CONST = 1000
|
||
|
||
T = self.num_time_slices
|
||
variance = self.variance[word]
|
||
fwd_variance = self.fwd_variance[word]
|
||
# forward pass. Set initial variance very high
|
||
fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST
|
||
for t in range(1, T + 1):
|
||
if self.obs_variance:
|
||
c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance)
|
||
else:
|
||
c = 0
|
||
fwd_variance[t] = c * (fwd_variance[t - 1] + chain_variance)
|
||
|
||
# backward pass
|
||
variance[T] = fwd_variance[T]
|
||
for t in range(T - 1, -1, -1):
|
||
if fwd_variance[t] > 0.0:
|
||
c = np.power((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2)
|
||
else:
|
||
c = 0
|
||
variance[t] = (c * (variance[t + 1] - chain_variance)) + ((1 - c) * fwd_variance[t])
|
||
|
||
return variance, fwd_variance
|
||
|
||
def compute_post_mean(self, word, chain_variance):
|
||
"""Get the mean, based on the `Variational Kalman Filtering approach for Approximate Inference (section 3.1)
|
||
<https://mimno.infosci.cornell.edu/info6150/readings/dynamic_topic_models.pdf>`_.
|
||
|
||
Notes
|
||
-----
|
||
        This function essentially computes :math:`E[\beta_{t,w}]` for t = 1:T.

        .. math::

            fwd\_mean[t] \equiv E(\beta_{t,w} | \hat{\beta}_{1:t}) =
            (obs\_variance / (fwd\_variance[t - 1] + chain\_variance + obs\_variance)) * fwd\_mean[t - 1] +
            (1 - obs\_variance / (fwd\_variance[t - 1] + chain\_variance + obs\_variance)) * obs[t - 1]

        .. math::

            mean[t] \equiv E(\beta_{t,w} | \hat{\beta}_{1:T}) =
            (chain\_variance / (fwd\_variance[t] + chain\_variance)) * fwd\_mean[t] +
            (1 - chain\_variance / (fwd\_variance[t] + chain\_variance)) * mean[t + 1]
|
||
|
||
Parameters
|
||
----------
|
||
word: int
|
||
The word's ID.
|
||
chain_variance : float
|
||
Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time.
|
||
|
||
Returns
|
||
-------
|
||
(numpy.ndarray, numpy.ndarray)
|
||
The first returned value is the mean of each word in each time slice, the second value is the
|
||
inferred posterior mean for the same pairs.
|
||
|
||
"""
|
||
T = self.num_time_slices
|
||
obs = self.obs[word]
|
||
fwd_variance = self.fwd_variance[word]
|
||
mean = self.mean[word]
|
||
fwd_mean = self.fwd_mean[word]
|
||
|
||
# forward
|
||
fwd_mean[0] = 0
|
||
for t in range(1, T + 1):
|
||
c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance)
|
||
fwd_mean[t] = c * fwd_mean[t - 1] + (1 - c) * obs[t - 1]
|
||
|
||
# backward pass
|
||
mean[T] = fwd_mean[T]
|
||
for t in range(T - 1, -1, -1):
|
||
if chain_variance == 0.0:
|
||
c = 0.0
|
||
else:
|
||
c = chain_variance / (fwd_variance[t] + chain_variance)
|
||
mean[t] = c * fwd_mean[t] + (1 - c) * mean[t + 1]
|
||
return mean, fwd_mean
|
||
|
||
def compute_expected_log_prob(self):
|
||
"""Compute the expected log probability given values of m.
|
||
|
||
The appendix describes the Expectation of log-probabilities in equation 5 of the DTM paper;
|
||
The below implementation is the result of solving the equation and is implemented as in the original
|
||
Blei DTM code.
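
        In symbols, for each word w and time slice t (a restatement of the update implemented below; slot t + 1 of
        the mean corresponds to time slice t because slot 0 holds the initial state):

        .. math::

            e\_log\_prob[w, t] = mean[w, t + 1] - \log(\zeta_t)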
|
||
|
||
Returns
|
||
-------
|
||
numpy.ndarray of float
|
||
The expected value for the log probabilities for each word and time slice.
|
||
|
||
"""
|
||
for (w, t), val in np.ndenumerate(self.e_log_prob):
|
||
self.e_log_prob[w][t] = self.mean[w][t + 1] - np.log(self.zeta[t])
|
||
return self.e_log_prob
|
||
|
||
def sslm_counts_init(self, obs_variance, chain_variance, sstats):
|
||
"""Initialize the State Space Language Model with LDA sufficient statistics.
|
||
|
||
Called for each topic-chain and initializes initial mean, variance and Topic-Word probabilities
|
||
for the first time-slice.
|
||
|
||
Parameters
|
||
----------
|
||
obs_variance : float, optional
|
||
Observed variance used to approximate the true and forward variance.
|
||
chain_variance : float
|
||
Gaussian parameter defined in the beta distribution to dictate how the beta values evolve over time.
|
||
sstats : numpy.ndarray
|
||
Sufficient statistics of the LDA model. Corresponds to matrix beta in the linked paper for time slice 0,
|
||
expected shape (`self.vocab_len`, `num_topics`).
|
||
|
||
"""
|
||
W = self.vocab_len
|
||
T = self.num_time_slices
|
||
|
||
log_norm_counts = np.copy(sstats)
|
||
log_norm_counts = log_norm_counts / sum(log_norm_counts)
|
||
log_norm_counts = log_norm_counts + 1.0 / W
|
||
log_norm_counts = log_norm_counts / sum(log_norm_counts)
|
||
log_norm_counts = np.log(log_norm_counts)
|
||
|
||
# setting variational observations to transformed counts
|
||
self.obs = (np.repeat(log_norm_counts, T, axis=0)).reshape(W, T)
|
||
# set variational parameters
|
||
self.obs_variance = obs_variance
|
||
self.chain_variance = chain_variance
|
||
|
||
# compute post variance, mean
|
||
for w in range(0, W):
|
||
self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance)
|
||
self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, self.chain_variance)
|
||
|
||
self.zeta = self.update_zeta()
|
||
self.e_log_prob = self.compute_expected_log_prob()
|
||
|
||
def fit_sslm(self, sstats):
|
||
"""Fits variational distribution.
|
||
|
||
This is essentially the m-step.
|
||
Maximizes the approximation of the true posterior for a particular topic using the provided sufficient
|
||
statistics. Updates the values using :meth:`~gensim.models.ldaseqmodel.sslm.update_obs` and
|
||
:meth:`~gensim.models.ldaseqmodel.sslm.compute_expected_log_prob`.
|
||
|
||
Parameters
|
||
----------
|
||
sstats : numpy.ndarray
|
||
Sufficient statistics for a particular topic. Corresponds to matrix beta in the linked paper for the
|
||
current time slice, expected shape (`self.vocab_len`, `num_topics`).
|
||
|
||
Returns
|
||
-------
|
||
float
|
||
The lower bound for the true posterior achieved using the fitted approximate distribution.
|
||
|
||
"""
|
||
W = self.vocab_len
|
||
bound = 0
|
||
old_bound = 0
|
||
sslm_fit_threshold = 1e-6
|
||
sslm_max_iter = 2
|
||
converged = sslm_fit_threshold + 1
|
||
|
||
# computing variance, fwd_variance
|
||
self.variance, self.fwd_variance = \
|
||
(np.array(x) for x in list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)])))
|
||
|
||
# column sum of sstats
|
||
totals = sstats.sum(axis=0)
|
||
iter_ = 0
|
||
|
||
model = "DTM"
|
||
if model == "DTM":
|
||
bound = self.compute_bound(sstats, totals)
|
||
if model == "DIM":
|
||
bound = self.compute_bound_fixed(sstats, totals)
|
||
|
||
logger.info("initial sslm bound is %f", bound)
|
||
|
||
while converged > sslm_fit_threshold and iter_ < sslm_max_iter:
|
||
iter_ += 1
|
||
old_bound = bound
|
||
self.obs, self.zeta = self.update_obs(sstats, totals)
|
||
|
||
if model == "DTM":
|
||
bound = self.compute_bound(sstats, totals)
|
||
if model == "DIM":
|
||
bound = self.compute_bound_fixed(sstats, totals)
|
||
|
||
converged = np.fabs((bound - old_bound) / old_bound)
|
||
logger.info("iteration %i iteration lda seq bound is %f convergence is %f", iter_, bound, converged)
|
||
|
||
self.e_log_prob = self.compute_expected_log_prob()
|
||
return bound
|
||
|
||
def compute_bound(self, sstats, totals):
|
||
"""Compute the maximized lower bound achieved for the log probability of the true posterior.
|
||
|
||
Uses the formula presented in the appendix of the DTM paper (formula no. 5).
|
||
|
||
Parameters
|
||
----------
|
||
sstats : numpy.ndarray
|
||
Sufficient statistics for a particular topic. Corresponds to matrix beta in the linked paper for the first
|
||
time slice, expected shape (`self.vocab_len`, `num_topics`).
|
||
totals : list of int of length `len(self.time_slice)`
|
||
The totals for each time slice.
|
||
|
||
Returns
|
||
-------
|
||
float
|
||
The maximized lower bound.
|
||
|
||
"""
|
||
w = self.vocab_len
|
||
t = self.num_time_slices
|
||
|
||
term_1 = 0
|
||
term_2 = 0
|
||
term_3 = 0
|
||
|
||
val = 0
|
||
ent = 0
|
||
|
||
chain_variance = self.chain_variance
|
||
# computing mean, fwd_mean
|
||
self.mean, self.fwd_mean = \
|
||
(np.array(x) for x in zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, w)]))
|
||
self.zeta = self.update_zeta()
|
||
|
||
for w in range(0, w):
|
||
val += (self.variance[w][0] - self.variance[w][t]) / 2 * chain_variance
|
||
|
||
logger.info("Computing bound, all times")
|
||
|
||
for t in range(1, t + 1):
|
||
term_1 = 0.0
|
||
term_2 = 0.0
|
||
ent = 0.0
|
||
for w in range(0, w):
|
||
|
||
m = self.mean[w][t]
|
||
prev_m = self.mean[w][t - 1]
|
||
|
||
v = self.variance[w][t]
|
||
|
||
# w_phi_l is only used in Document Influence Model; the values are always zero in this case
|
||
# w_phi_l = sslm.w_phi_l[w][t - 1]
|
||
# exp_i = np.exp(-prev_m)
|
||
# term_1 += (np.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) -
|
||
# (v / chain_variance) - np.log(chain_variance)
|
||
|
||
term_1 += \
|
||
(np.power(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - np.log(chain_variance)
|
||
term_2 += sstats[w][t - 1] * m
|
||
ent += np.log(v) / 2 # note the 2pi's cancel with term1 (see doc)
|
||
|
||
term_3 = -totals[t - 1] * np.log(self.zeta[t - 1])
|
||
val += term_2 + term_3 + ent - term_1
|
||
|
||
return val
|
||
|
||
def update_obs(self, sstats, totals):
|
||
"""Optimize the bound with respect to the observed variables.
|
||
|
||
TODO:
|
||
This is by far the slowest function in the whole algorithm.
|
||
Replacing or improving the performance of this would greatly speed things up.
|
||
|
||
Parameters
|
||
----------
|
||
sstats : numpy.ndarray
|
||
Sufficient statistics for a particular topic. Corresponds to matrix beta in the linked paper for the first
|
||
time slice, expected shape (`self.vocab_len`, `num_topics`).
|
||
totals : list of int of length `len(self.time_slice)`
|
||
The totals for each time slice.
|
||
|
||
Returns
|
||
-------
|
||
(numpy.ndarray of float, numpy.ndarray of float)
|
||
The updated optimized values for obs and the zeta variational parameter.
|
||
|
||
"""
|
||
|
||
OBS_NORM_CUTOFF = 2
|
||
STEP_SIZE = 0.01
|
||
TOL = 1e-3
|
||
|
||
W = self.vocab_len
|
||
T = self.num_time_slices
|
||
|
||
runs = 0
|
||
mean_deriv_mtx = np.resize(np.zeros(T * (T + 1)), (T, T + 1))
|
||
|
||
norm_cutoff_obs = None
|
||
for w in range(0, W):
|
||
w_counts = sstats[w]
|
||
counts_norm = 0
|
||
# now we find L2 norm of w_counts
|
||
for i in range(0, len(w_counts)):
|
||
counts_norm += w_counts[i] * w_counts[i]
|
||
|
||
counts_norm = np.sqrt(counts_norm)
|
||
|
||
if counts_norm < OBS_NORM_CUTOFF and norm_cutoff_obs is not None:
|
||
obs = self.obs[w]
|
||
norm_cutoff_obs = np.copy(obs)
|
||
else:
|
||
if counts_norm < OBS_NORM_CUTOFF:
|
||
w_counts = np.zeros(len(w_counts))
|
||
|
||
# TODO: apply lambda function
|
||
for t in range(0, T):
|
||
mean_deriv = mean_deriv_mtx[t]
|
||
mean_deriv = self.compute_mean_deriv(w, t, mean_deriv)
|
||
mean_deriv_mtx[t] = mean_deriv
|
||
|
||
deriv = np.zeros(T)
|
||
args = self, w_counts, totals, mean_deriv_mtx, w, deriv
|
||
obs = self.obs[w]
|
||
model = "DTM"
|
||
|
||
if model == "DTM":
|
||
# slowest part of method
|
||
obs = optimize.fmin_cg(
|
||
f=f_obs, fprime=df_obs, x0=obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0
|
||
)
|
||
if model == "DIM":
|
||
pass
|
||
runs += 1
|
||
|
||
if counts_norm < OBS_NORM_CUTOFF:
|
||
norm_cutoff_obs = obs
|
||
|
||
self.obs[w] = obs
|
||
|
||
self.zeta = self.update_zeta()
|
||
|
||
return self.obs, self.zeta
|
||
|
||
def compute_mean_deriv(self, word, time, deriv):
|
||
"""Helper functions for optimizing a function.
|
||
|
||
Compute the derivative of:
|
||
|
||
.. :math::
|
||
|
||
E[\beta_{t,w}]/d obs_{s,w} for t = 1:T.
|
||
|
||
Parameters
|
||
----------
|
||
word : int
|
||
The word's ID.
|
||
time : int
|
||
The time slice.
|
||
deriv : list of float
|
||
Derivative for each time slice.
|
||
|
||
Returns
|
||
-------
|
||
list of float
|
||
Mean derivative for each time slice.
|
||
|
||
"""
|
||
|
||
T = self.num_time_slices
|
||
fwd_variance = self.variance[word]
|
||
|
||
deriv[0] = 0
|
||
|
||
# forward pass
|
||
for t in range(1, T + 1):
|
||
if self.obs_variance > 0.0:
|
||
w = self.obs_variance / (fwd_variance[t - 1] + self.chain_variance + self.obs_variance)
|
||
else:
|
||
w = 0.0
|
||
val = w * deriv[t - 1]
|
||
if time == t - 1:
|
||
val += (1 - w)
|
||
deriv[t] = val
|
||
|
||
for t in range(T - 1, -1, -1):
|
||
if self.chain_variance == 0.0:
|
||
w = 0.0
|
||
else:
|
||
w = self.chain_variance / (fwd_variance[t] + self.chain_variance)
|
||
deriv[t] = w * deriv[t] + (1 - w) * deriv[t + 1]
|
||
|
||
return deriv
|
||
|
||
def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv):
|
||
"""Derivation of obs which is used in derivative function `df_obs` while optimizing.
|
||
|
||
Parameters
|
||
----------
|
||
word : int
|
||
The word's ID.
|
||
word_counts : list of int
|
||
Total word counts for each time slice.
|
||
totals : list of int of length `len(self.time_slice)`
|
||
The totals for each time slice.
|
||
mean_deriv_mtx : list of float
|
||
Mean derivative for each time slice.
|
||
deriv : list of float
|
||
Mean derivative for each time slice.
|
||
|
||
Returns
|
||
-------
|
||
        list of float
            The derivative of the bound with respect to the observed values, one value per time slice.
|
||
|
||
"""
|
||
|
||
# flag
|
||
init_mult = 1000
|
||
|
||
T = self.num_time_slices
|
||
|
||
mean = self.mean[word]
|
||
variance = self.variance[word]
|
||
|
||
# only used for DIM mode
|
||
# w_phi_l = self.w_phi_l[word]
|
||
# m_update_coeff = self.m_update_coeff[word]
|
||
|
||
# temp_vector holds temporary zeta values
|
||
self.temp_vect = np.zeros(T)
|
||
|
||
for u in range(0, T):
|
||
self.temp_vect[u] = np.exp(mean[u + 1] + variance[u + 1] / 2)
|
||
|
||
for t in range(0, T):
|
||
mean_deriv = mean_deriv_mtx[t]
|
||
term1 = 0
|
||
term2 = 0
|
||
term3 = 0
|
||
term4 = 0
|
||
|
||
for u in range(1, T + 1):
|
||
mean_u = mean[u]
|
||
mean_u_prev = mean[u - 1]
|
||
dmean_u = mean_deriv[u]
|
||
dmean_u_prev = mean_deriv[u - 1]
|
||
|
||
term1 += (mean_u - mean_u_prev) * (dmean_u - dmean_u_prev)
|
||
term2 += (word_counts[u - 1] - (totals[u - 1] * self.temp_vect[u - 1] / self.zeta[u - 1])) * dmean_u
|
||
|
||
model = "DTM"
|
||
if model == "DIM":
|
||
# do some stuff
|
||
pass
|
||
|
||
if self.chain_variance:
|
||
term1 = - (term1 / self.chain_variance)
|
||
term1 = term1 - (mean[0] * mean_deriv[0]) / (init_mult * self.chain_variance)
|
||
else:
|
||
term1 = 0.0
|
||
|
||
deriv[t] = term1 + term2 + term3 + term4
|
||
|
||
return deriv
|
||
|
||
|
||
class LdaPost(utils.SaveLoad):
|
||
"""Posterior values associated with each set of documents.
|
||
|
||
TODO: use **Hoffman, Blei, Bach: Online Learning for Latent Dirichlet Allocation, NIPS 2010.**
|
||
to update phi, gamma. End game would be to somehow replace LdaPost entirely with LdaModel.
|
||
|
||
"""
|
||
|
||
def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma=None, lhood=None):
|
||
"""Initialize the posterior value structure for the given LDA model.
|
||
|
||
Parameters
|
||
----------
|
||
doc : list of (int, int)
|
||
A BOW representation of the document. Each element in the list is a pair of a word's ID and its number
|
||
of occurences in the document.
|
||
lda : :class:`~gensim.models.ldamodel.LdaModel`, optional
|
||
The underlying LDA model.
|
||
max_doc_len : int, optional
|
||
The maximum number of words in a document.
|
||
num_topics : int, optional
|
||
Number of topics discovered by the LDA model.
|
||
gamma : numpy.ndarray, optional
|
||
Topic weight variational parameters for each document. If not supplied, it will be inferred from the model.
|
||
lhood : float, optional
|
||
The log likelihood lower bound.
|
||
|
||
"""
|
||
self.doc = doc
|
||
self.lda = lda
|
||
self.gamma = gamma
|
||
self.lhood = lhood
|
||
if self.gamma is None:
|
||
self.gamma = np.zeros(num_topics)
|
||
if self.lhood is None:
|
||
self.lhood = np.zeros(num_topics + 1)
|
||
|
||
if max_doc_len is not None and num_topics is not None:
|
||
self.phi = np.resize(np.zeros(max_doc_len * num_topics), (max_doc_len, num_topics))
|
||
self.log_phi = np.resize(np.zeros(max_doc_len * num_topics), (max_doc_len, num_topics))
|
||
|
||
# the following are class variables which are to be integrated during Document Influence Model
|
||
|
||
self.doc_weight = None
|
||
self.renormalized_doc_weight = None
|
||
|
||
def update_phi(self, doc_number, time):
|
||
"""Update variational multinomial parameters, based on a document and a time-slice.
|
||
|
||
This is done based on the original Blei-LDA paper, where:
|
||
log_phi := beta * exp(Ψ(gamma)), over every topic for every word.
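
        In symbols, for word n and topic k (a restatement of the update implemented below, before the row is
        log-normalized over the topics):

        .. math::

            \log \phi_{n, k} = \Psi(\gamma_k) + E[\log \beta_{k, w_n}]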
|
||
|
||
TODO: incorporate lee-sueng trick used in
|
||
**Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**.
|
||
|
||
Parameters
|
||
----------
|
||
doc_number : int
|
||
Document number. Unused.
|
||
time : int
|
||
Time slice. Unused.
|
||
|
||
Returns
|
||
-------
|
||
(list of float, list of float)
|
||
Multinomial parameters, and their logarithm, for each word in the document.
|
||
|
||
"""
|
||
num_topics = self.lda.num_topics
|
||
# digamma values
|
||
dig = np.zeros(num_topics)
|
||
|
||
for k in range(0, num_topics):
|
||
dig[k] = digamma(self.gamma[k])
|
||
|
||
n = 0 # keep track of iterations for phi, log_phi
|
||
for word_id, count in self.doc:
|
||
for k in range(0, num_topics):
|
||
self.log_phi[n][k] = dig[k] + self.lda.topics[word_id][k]
|
||
|
||
log_phi_row = self.log_phi[n]
|
||
phi_row = self.phi[n]
|
||
|
||
# log normalize
|
||
v = log_phi_row[0]
|
||
for i in range(1, len(log_phi_row)):
|
||
v = np.logaddexp(v, log_phi_row[i])
|
||
|
||
# subtract every element by v
|
||
log_phi_row = log_phi_row - v
|
||
phi_row = np.exp(log_phi_row)
|
||
self.log_phi[n] = log_phi_row
|
||
self.phi[n] = phi_row
|
||
n += 1 # increase iteration
|
||
|
||
return self.phi, self.log_phi
|
||
|
||
def update_gamma(self):
|
||
"""Update variational dirichlet parameters.
|
||
|
||
This operations is described in the original Blei LDA paper:
|
||
gamma = alpha + sum(phi), over every topic for every word.
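
        In symbols, with c_n the count of word n in the document and alpha the Dirichlet prior (a restatement of
        the update implemented below):

        .. math::

            \gamma_k = alpha_k + \sum_{n} c_n \phi_{n, k}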
|
||
|
||
Returns
|
||
-------
|
||
list of float
|
||
The updated gamma parameters for each word in the document.
|
||
|
||
"""
|
||
self.gamma = np.copy(self.lda.alpha)
|
||
n = 0 # keep track of number of iterations for phi, log_phi
|
||
for word_id, count in self.doc:
|
||
phi_row = self.phi[n]
|
||
for k in range(0, self.lda.num_topics):
|
||
self.gamma[k] += phi_row[k] * count
|
||
n += 1
|
||
|
||
return self.gamma
|
||
|
||
def init_lda_post(self):
|
||
"""Initialize variational posterior. """
|
||
total = sum(count for word_id, count in self.doc)
|
||
self.gamma.fill(self.lda.alpha[0] + float(total) / self.lda.num_topics)
|
||
self.phi[:len(self.doc), :] = 1.0 / self.lda.num_topics
|
||
# doc_weight used during DIM
|
||
# ldapost.doc_weight = None
|
||
|
||
def compute_lda_lhood(self):
|
||
"""Compute the log likelihood bound.
|
||
|
||
Returns
|
||
-------
|
||
float
|
||
The optimal lower bound for the true posterior using the approximate distribution.
|
||
|
||
"""
|
||
num_topics = self.lda.num_topics
|
||
gamma_sum = np.sum(self.gamma)
|
||
|
||
# to be used in DIM
|
||
# sigma_l = 0
|
||
# sigma_d = 0
|
||
|
||
lhood = gammaln(np.sum(self.lda.alpha)) - gammaln(gamma_sum)
|
||
self.lhood[num_topics] = lhood
|
||
|
||
# influence_term = 0
|
||
digsum = digamma(gamma_sum)
|
||
|
||
model = "DTM" # noqa:F841
|
||
for k in range(0, num_topics):
|
||
# below code only to be used in DIM mode
|
||
# if ldapost.doc_weight is not None and (model == "DIM" or model == "fixed"):
|
||
# influence_topic = ldapost.doc_weight[k]
|
||
# influence_term = \
|
||
# - ((influence_topic * influence_topic + sigma_l * sigma_l) / 2.0 / (sigma_d * sigma_d))
|
||
|
||
e_log_theta_k = digamma(self.gamma[k]) - digsum
|
||
lhood_term = \
|
||
(self.lda.alpha[k] - self.gamma[k]) * e_log_theta_k + \
|
||
gammaln(self.gamma[k]) - gammaln(self.lda.alpha[k])
|
||
# TODO: check why there's an IF
|
||
n = 0
|
||
for word_id, count in self.doc:
|
||
if self.phi[n][k] > 0:
|
||
lhood_term += \
|
||
count * self.phi[n][k] * (e_log_theta_k + self.lda.topics[word_id][k] - self.log_phi[n][k])
|
||
n += 1
|
||
self.lhood[k] = lhood_term
|
||
lhood += lhood_term
|
||
# in case of DIM add influence term
|
||
# lhood += influence_term
|
||
|
||
return lhood
|
||
|
||
def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8,
|
||
lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None):
|
||
"""Posterior inference for lda.
|
||
|
||
Parameters
|
||
----------
|
||
doc_number : int
|
||
            The document's number.
|
||
time : int
|
||
Time slice.
|
||
ldaseq : object
|
||
Unused.
|
||
LDA_INFERENCE_CONVERGED : float
|
||
Epsilon value used to check whether the inference step has sufficiently converged.
|
||
lda_inference_max_iter : int
|
||
Maximum number of iterations in the inference step.
|
||
g : object
|
||
Unused. Will be useful when the DIM model is implemented.
|
||
g3_matrix: object
|
||
Unused. Will be useful when the DIM model is implemented.
|
||
g4_matrix: object
|
||
Unused. Will be useful when the DIM model is implemented.
|
||
g5_matrix: object
|
||
Unused. Will be useful when the DIM model is implemented.
|
||
|
||
Returns
|
||
-------
|
||
float
|
||
The optimal lower bound for the true posterior using the approximate distribution.
|
||
"""
|
||
|
||
self.init_lda_post()
|
||
# sum of counts in a doc
|
||
total = sum(count for word_id, count in self.doc)
|
||
|
||
model = "DTM"
|
||
if model == "DIM":
|
||
# if in DIM then we initialise some variables here
|
||
pass
|
||
|
||
lhood = self.compute_lda_lhood()
|
||
lhood_old = 0
|
||
converged = 0
|
||
iter_ = 0
|
||
|
||
# first iteration starts here
|
||
iter_ += 1
|
||
lhood_old = lhood
|
||
self.gamma = self.update_gamma()
|
||
|
||
model = "DTM"
|
||
|
||
if model == "DTM" or sslm is None:
|
||
self.phi, self.log_phi = self.update_phi(doc_number, time)
|
||
elif model == "DIM" and sslm is not None:
|
||
self.phi, self.log_phi = self.update_phi_fixed(doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix)
|
||
|
||
lhood = self.compute_lda_lhood()
|
||
converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
|
||
|
||
while converged > LDA_INFERENCE_CONVERGED and iter_ <= lda_inference_max_iter:
|
||
|
||
iter_ += 1
|
||
lhood_old = lhood
|
||
self.gamma = self.update_gamma()
|
||
model = "DTM"
|
||
|
||
if model == "DTM" or sslm is None:
|
||
self.phi, self.log_phi = self.update_phi(doc_number, time)
|
||
elif model == "DIM" and sslm is not None:
|
||
self.phi, self.log_phi = self.update_phi_fixed(doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix)
|
||
|
||
lhood = self.compute_lda_lhood()
|
||
converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
|
||
|
||
return lhood
|
||
|
||
def update_lda_seq_ss(self, time, doc, topic_suffstats):
|
||
"""Update lda sequence sufficient statistics from an lda posterior.
|
||
|
||
This is very similar to the :meth:`~gensim.models.ldaseqmodel.LdaPost.update_gamma` method and uses
|
||
the same formula.
|
||
|
||
Parameters
|
||
----------
|
||
time : int
|
||
The time slice.
|
||
doc : list of (int, float)
|
||
Unused but kept here for backwards compatibility. The document set in the constructor (`self.doc`) is used
|
||
instead.
|
||
topic_suffstats : list of float
|
||
Sufficient statistics for each topic.
|
||
|
||
Returns
|
||
-------
|
||
list of float
|
||
The updated sufficient statistics for each topic.
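
        In symbols, for each topic k and each word n in the document, with c_n the count of word n (a restatement
        of the update implemented below):

        .. math::

            topic\_suffstats[k][w_n, time] \leftarrow topic\_suffstats[k][w_n, time] + c_n \phi_{n, k}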
|
||
|
||
"""
|
||
num_topics = self.lda.num_topics
|
||
|
||
for k in range(0, num_topics):
|
||
topic_ss = topic_suffstats[k]
|
||
n = 0
|
||
for word_id, count in self.doc:
|
||
topic_ss[word_id][time] += count * self.phi[n][k]
|
||
n += 1
|
||
topic_suffstats[k] = topic_ss
|
||
|
||
return topic_suffstats
|
||
|
||
|
||
# the following functions are used in update_obs as the objective function.
|
||
def f_obs(x, *args):
|
||
"""Function which we are optimising for minimizing obs.
|
||
|
||
Parameters
|
||
----------
|
||
x : list of float
|
||
The obs values for this word.
|
||
sslm : :class:`~gensim.models.ldaseqmodel.sslm`
|
||
The State Space Language Model for DTM.
|
||
word_counts : list of int
|
||
Total word counts for each time slice.
|
||
totals : list of int of length `len(self.time_slice)`
|
||
The totals for each time slice.
|
||
mean_deriv_mtx : list of float
|
||
Mean derivative for each time slice.
|
||
word : int
|
||
The word's ID.
|
||
deriv : list of float
|
||
Mean derivative for each time slice.
|
||
|
||
Returns
|
||
-------
|
||
list of float
|
||
The value of the objective function evaluated at point `x`.
|
||
|
||
"""
|
||
sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args
|
||
# flag
|
||
init_mult = 1000
|
||
|
||
T = len(x)
|
||
val = 0
|
||
term1 = 0
|
||
term2 = 0
|
||
|
||
# term 3 and 4 for DIM
|
||
term3 = 0
|
||
term4 = 0
|
||
|
||
sslm.obs[word] = x
|
||
sslm.mean[word], sslm.fwd_mean[word] = sslm.compute_post_mean(word, sslm.chain_variance)
|
||
|
||
mean = sslm.mean[word]
|
||
variance = sslm.variance[word]
|
||
|
||
# only used for DIM mode
|
||
# w_phi_l = sslm.w_phi_l[word]
|
||
# m_update_coeff = sslm.m_update_coeff[word]
|
||
|
||
for t in range(1, T + 1):
|
||
mean_t = mean[t]
|
||
mean_t_prev = mean[t - 1]
|
||
|
||
val = mean_t - mean_t_prev
|
||
term1 += val * val
|
||
term2 += word_counts[t - 1] * mean_t - totals[t - 1] * np.exp(mean_t + variance[t] / 2) / sslm.zeta[t - 1]
|
||
|
||
model = "DTM"
|
||
if model == "DIM":
|
||
# stuff happens
|
||
pass
|
||
|
||
if sslm.chain_variance > 0.0:
|
||
|
||
term1 = - (term1 / (2 * sslm.chain_variance))
|
||
term1 = term1 - mean[0] * mean[0] / (2 * init_mult * sslm.chain_variance)
|
||
else:
|
||
term1 = 0.0
|
||
|
||
final = -(term1 + term2 + term3 + term4)
|
||
|
||
return final
|
||
|
||
|
||
def df_obs(x, *args):
|
||
"""Derivative of the objective function which optimises obs.
|
||
|
||
Parameters
|
||
----------
|
||
x : list of float
|
||
The obs values for this word.
|
||
sslm : :class:`~gensim.models.ldaseqmodel.sslm`
|
||
The State Space Language Model for DTM.
|
||
word_counts : list of int
|
||
Total word counts for each time slice.
|
||
totals : list of int of length `len(self.time_slice)`
|
||
The totals for each time slice.
|
||
mean_deriv_mtx : list of float
|
||
Mean derivative for each time slice.
|
||
word : int
|
||
The word's ID.
|
||
deriv : list of float
|
||
Mean derivative for each time slice.
|
||
|
||
Returns
|
||
-------
|
||
list of float
|
||
The derivative of the objective function evaluated at point `x`.
|
||
|
||
"""
|
||
sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args
|
||
|
||
sslm.obs[word] = x
|
||
sslm.mean[word], sslm.fwd_mean[word] = sslm.compute_post_mean(word, sslm.chain_variance)
|
||
|
||
model = "DTM"
|
||
if model == "DTM":
|
||
deriv = sslm.compute_obs_deriv(word, word_counts, totals, mean_deriv_mtx, deriv)
|
||
elif model == "DIM":
|
||
deriv = sslm.compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, deriv) # noqa:F821
|
||
|
||
return np.negative(deriv)
|