#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2016 Radim Rehurek <radimrehurek@seznam.cz>
# Copyright (C) 2016 Olavur Mortensen <olavurmortensen@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Author-topic model.
This module trains the author-topic model on documents and corresponding author-document dictionaries.
The training is online and is constant in memory w.r.t. the number of documents.
The model is *not* constant in memory w.r.t. the number of authors.
The model can be updated with additional documents after training has been completed. It is
also possible to continue training on the existing data.
The model is closely related to :class:`~gensim.models.ldamodel.LdaModel`.
The :class:`~gensim.models.atmodel.AuthorTopicModel` class inherits :class:`~gensim.models.ldamodel.LdaModel`,
and its usage is thus similar.
The model was introduced by `Rosen-Zvi and co-authors: "The Author-Topic Model for Authors and Documents"
<https://arxiv.org/abs/1207.4169>`_. The model correlates authorship information with the topics to give better
insight into an author's subject knowledge.
Example
-------
>>> from gensim.models import AuthorTopicModel
>>> from gensim.corpora import mmcorpus
>>> from gensim.test.utils import common_dictionary, datapath, temporary_file
>>> author2doc = {
... 'john': [0, 1, 2, 3, 4, 5, 6],
... 'jane': [2, 3, 4, 5, 6, 7, 8],
... 'jack': [0, 2, 4, 6, 8]
... }
>>>
>>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
>>>
>>> with temporary_file("serialized") as s_path:
... model = AuthorTopicModel(
... corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4,
... serialized=True, serialization_path=s_path
... )
...
... model.update(corpus, author2doc) # update the author-topic model with additional documents
>>>
>>> # construct vectors for authors
>>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
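The trained model can then be queried for single authors, or for authors not seen during training.
A short, illustrative sketch reusing the `model` and `corpus` objects from above:
>>> jane_topics = model.get_author_topics('jane')  # topic distribution of a known author
>>>
>>> # treat the whole corpus as the work of one unseen author and infer that author's topics
>>> new_author_topics = model.get_new_author_topics(corpus)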
"""
# TODO: this class inherits LdaModel and overwrites some methods. There is some code
# duplication still, and a refactor could be made to avoid this. Comments with "TODOs"
# are included in the code where this is the case, for example in the log_perplexity
# and do_estep methods.
import logging
import numpy as np # for arrays, array broadcasting etc.
from copy import deepcopy
from shutil import copyfile
from os.path import isfile
from os import remove
from gensim import utils
from gensim.models import LdaModel
from gensim.models.ldamodel import LdaState
from gensim.matutils import dirichlet_expectation
from gensim.corpora import MmCorpus
from itertools import chain
from scipy.special import gammaln # gamma function utils
from six.moves import xrange
import six
logger = logging.getLogger(__name__)
class AuthorTopicState(LdaState):
"""Encapsulate information for computation of :class:`~gensim.models.atmodel.AuthorTopicModel`."""
def __init__(self, eta, lambda_shape, gamma_shape):
"""
Parameters
----------
eta: numpy.ndarray
Dirichlet prior on the per-topic word distribution (the topic-word prior).
lambda_shape: (int, int)
Shape of the topic-word sufficient statistics matrix, i.e. (num_topics, num_terms).
gamma_shape: (int, int)
Shape of the variational author-topic matrix gamma, i.e. (num_authors, num_topics).
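Example
-------
A minimal, illustrative sketch of constructing a state directly (normally this is done internally by
:class:`~gensim.models.atmodel.AuthorTopicModel`); the shapes below are arbitrary example values:
>>> import numpy as np
>>> from gensim.models.atmodel import AuthorTopicState
>>> eta = np.full(10, 0.01)  # topic-word prior for a 10-term vocabulary (toy value)
>>> state = AuthorTopicState(eta, (5, 10), (3, 5))  # 5 topics, 10 terms, 3 authors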
"""
self.eta = eta
self.sstats = np.zeros(lambda_shape)
self.gamma = np.zeros(gamma_shape)
self.numdocs = 0
self.dtype = np.float64 # To be compatible with LdaState
def construct_doc2author(corpus, author2doc):
"""Create a mapping from document IDs to author IDs.
Parameters
----------
corpus: iterable of list of (int, float)
Corpus in BoW format.
author2doc: dict of (str, list of int)
Mapping of authors to documents.
Returns
-------
dict of (int, list of str)
Document to Author mapping.
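Example
-------
A small, illustrative example; the word counts in `corpus` are irrelevant here, only the number of
documents matters:
>>> from gensim.models.atmodel import construct_doc2author
>>> corpus = [[(0, 2), (1, 1)], [(1, 3)]]  # two documents in BoW format (toy values)
>>> author2doc = {'john': [0], 'jane': [0, 1]}
>>> doc2author = construct_doc2author(corpus, author2doc)  # -> {0: ['john', 'jane'], 1: ['jane']}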
"""
doc2author = {}
for d, _ in enumerate(corpus):
author_ids = []
for a, a_doc_ids in author2doc.items():
if d in a_doc_ids:
author_ids.append(a)
doc2author[d] = author_ids
return doc2author
def construct_author2doc(doc2author):
"""Make a mapping from author IDs to document IDs.
Parameters
----------
doc2author: dict of (int, list of str)
Mapping of document id to authors.
Returns
-------
dict of (str, list of int)
Mapping of authors to document ids.
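Example
-------
A small, illustrative example; note that the order of the resulting keys depends on set iteration and
is not guaranteed:
>>> from gensim.models.atmodel import construct_author2doc
>>> doc2author = {0: ['john', 'jane'], 1: ['jane']}
>>> author2doc = construct_author2doc(doc2author)  # -> {'john': [0], 'jane': [0, 1]}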
"""
# First get a set of all authors.
authors_ids = set()
for d, a_doc_ids in doc2author.items():
for a in a_doc_ids:
authors_ids.add(a)
# Now construct the dictionary.
author2doc = {}
for a in authors_ids:
author2doc[a] = []
for d, a_ids in doc2author.items():
if a in a_ids:
author2doc[a].append(d)
return author2doc
class AuthorTopicModel(LdaModel):
"""The constructor estimates the author-topic model parameters based on a training corpus."""
def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None,
chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
gamma_threshold=0.001, serialized=False, serialization_path=None,
minimum_probability=0.01, random_state=None):
"""
Parameters
----------
corpus : iterable of list of (int, float), optional
Corpus in BoW format
num_topics : int, optional
Number of topics to be extracted from the training corpus.
id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
A mapping from word ids (integers) to words (strings).
author2doc : dict of (str, list of int), optional
A dictionary where keys are the names of authors and values are lists of document IDs that the author
contributes to.
doc2author : dict of (int, list of str), optional
A dictionary where the keys are document IDs and the values are lists of author names.
chunksize : int, optional
Controls the size of the mini-batches.
passes : int, optional
Number of times the model makes a pass over the entire training data.
iterations : int, optional
Maximum number of times the model loops over each document.
decay : float, optional
Exponential decay factor in (0.5, 1] controlling how quickly the contribution of old documents is
forgotten when new documents are examined.
offset : float, optional
Down-weights early iterations, slowing down the first few updates.
alpha : {float, numpy.ndarray, str}, optional
A-priori belief on the author-topic distribution. Supports the special values 'symmetric' (the default),
'asymmetric' (a fixed normalized asymmetric 1.0 / topicno prior) and 'auto' (learns an asymmetric prior
directly from your data).
eta : {float, numpy.ndarray, str}, optional
A-priori belief on the topic-word distribution. The special values 'symmetric' (the default) and 'auto'
are supported; 'asymmetric' is not allowed for eta.
update_every : int, optional
Number of chunks to process before each update of the topic distributions (M step);
set to 0 for batch learning (a single update at the end of every pass).
eval_every : int, optional
Estimate and log perplexity every `eval_every` chunks; set to 0 or None to disable.
gamma_threshold : float, optional
Minimum change in the gamma parameters required to keep iterating over a document.
serialized : bool, optional
Indicates whether the input corpora to the model are simple lists
or saved to the hard-drive.
serialization_path : str, optional
Must be set to a filepath, if `serialized = True` is used.
minimum_probability : float, optional
Topics with a probability lower than this threshold will be filtered out of the returned
author-topic distributions.
random_state : {int, numpy.random.RandomState}, optional
Set the state of the random number generator inside the author-topic model.
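Example
-------
A minimal in-memory sketch (no serialization), assuming `corpus`, `author2doc` and `common_dictionary`
as in the module-level example:
>>> model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4)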
"""
# NOTE: this doesn't call constructor of a base class, but duplicates most of this code
# so we have to set dtype to float64 default here
self.dtype = np.float64
# NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the
# infrastructure to implement a distributed author-topic model is already in place,
# such as the AuthorTopicState.
distributed = False
self.dispatcher = None
self.numworkers = 1
self.id2word = id2word
if corpus is None and self.id2word is None:
raise ValueError(
"at least one of corpus/id2word must be specified, to establish input space dimensionality"
)
if self.id2word is None:
logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
self.id2word = utils.dict_from_corpus(corpus)
self.num_terms = len(self.id2word)
elif len(self.id2word) > 0:
self.num_terms = 1 + max(self.id2word.keys())
else:
self.num_terms = 0
if self.num_terms == 0:
raise ValueError("cannot compute the author-topic model over an empty collection (no terms)")
logger.info('Vocabulary consists of %d words.', self.num_terms)
self.author2doc = {}
self.doc2author = {}
self.distributed = distributed
self.num_topics = num_topics
self.num_authors = 0
self.chunksize = chunksize
self.decay = decay
self.offset = offset
self.minimum_probability = minimum_probability
self.num_updates = 0
self.total_docs = 0
self.passes = passes
self.update_every = update_every
self.eval_every = eval_every
self.author2id = {}
self.id2author = {}
self.serialized = serialized
if serialized and not serialization_path:
raise ValueError(
"If serialized corpora are used, a the path to a folder "
"where the corpus should be saved must be provided (serialized_path)."
)
if serialized and serialization_path:
assert not isfile(serialization_path), \
"A file already exists at the serialization_path path; " \
"choose a different serialization_path, or delete the file."
self.serialization_path = serialization_path
# Initialize an empty self.corpus.
self.init_empty_corpus()
self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
assert self.alpha.shape == (self.num_topics,), \
"Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
if isinstance(eta, six.string_types):
if eta == 'asymmetric':
raise ValueError("The 'asymmetric' option cannot be used for eta")
self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')
self.random_state = utils.get_random_state(random_state)
assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
(str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)
)
# VB constants
self.iterations = iterations
self.gamma_threshold = gamma_threshold
# Initialize the variational distributions q(beta|lambda) and q(theta|gamma)
self.state = AuthorTopicState(self.eta, (self.num_topics, self.num_terms), (self.num_authors, self.num_topics))
self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))
# if a training corpus was provided, start estimating the model right away
if corpus is not None and (author2doc is not None or doc2author is not None):
use_numpy = self.dispatcher is not None
self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy)
def __str__(self):
"""Get a string representation of object.
Returns
-------
str
String representation of current instance.
"""
return "AuthorTopicModel(num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s)" % \
(self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize)
def init_empty_corpus(self):
"""Initialize an empty corpus.
If the corpora are to be treated as lists, simply initialize an empty list.
If serialization is used, initialize an empty corpus using :class:`~gensim.corpora.mmcorpus.MmCorpus`.
"""
if self.serialized:
# Initialize the corpus as a serialized empty list.
# This corpus will be extended in self.update.
MmCorpus.serialize(self.serialization_path, []) # Serialize empty corpus.
self.corpus = MmCorpus(self.serialization_path) # Store serialized corpus object in self.corpus.
else:
# All input corpora are assumed to just be lists.
self.corpus = []
def extend_corpus(self, corpus):
"""Add new documents from `corpus` to `self.corpus`.
If serialization is used, then the entire corpus (`self.corpus`) is re-serialized and the new documents
are added in the process. If serialization is not used, the corpus, as a list of documents, is simply extended.
Parameters
----------
corpus : iterable of list of (int, float)
Corpus in BoW format
Raises
------
AssertionError
If serialized == False and corpus isn't list.
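Example
-------
A minimal sketch for the non-serialized case, where `self.corpus` is a plain list; `model` is assumed to
be an already constructed :class:`~gensim.models.atmodel.AuthorTopicModel` with `serialized=False`
(normally this method is invoked internally by :meth:`~gensim.models.atmodel.AuthorTopicModel.update`):
>>> more_docs = [[(0, 1), (3, 2)], [(1, 1)]]  # two new documents in BoW format (toy values)
>>> model.extend_corpus(more_docs)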
"""
if self.serialized:
# Re-serialize the entire corpus while appending the new documents.
if isinstance(corpus, MmCorpus):
# Check that we are not attempting to overwrite the serialized corpus.
assert self.corpus.input != corpus.input, \
'Input corpus cannot have the same file path as the model corpus (serialization_path).'
corpus_chain = chain(self.corpus, corpus) # A generator with the old and new documents.
# Make a temporary copy of the file where the corpus is serialized.
copyfile(self.serialization_path, self.serialization_path + '.tmp')
self.corpus.input = self.serialization_path + '.tmp' # Point the old corpus at this temporary file.
# Re-serialize the old corpus, and extend it with the new corpus.
MmCorpus.serialize(self.serialization_path, corpus_chain)
self.corpus = MmCorpus(self.serialization_path) # Store the new serialized corpus object in self.corpus.
remove(self.serialization_path + '.tmp') # Remove the temporary file again.
else:
# self.corpus and corpus are just lists, just extend the list.
# First check that corpus is actually a list.
assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists."
self.corpus.extend(corpus)
def compute_phinorm(self, expElogthetad, expElogbetad):
"""Efficiently computes the normalizing factor in phi.
Parameters
----------
expElogthetad: numpy.ndarray
Value of variational distribution :math:`q(\\theta|\\gamma)`.
expElogbetad: numpy.ndarray
Value of variational distribution :math:`q(\\beta|\\lambda)`.
Returns
-------
numpy.ndarray
Normalizing factor of phi, one value per word id in the document.
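Example
-------
A small numeric sketch with two authors, two topics and three word ids (toy values); `model` is assumed
to be any :class:`~gensim.models.atmodel.AuthorTopicModel` instance, since this method does not depend
on the model state:
>>> import numpy as np
>>> expElogthetad = np.array([[0.5, 0.5], [0.2, 0.8]])  # 2 authors x 2 topics
>>> expElogbetad = np.array([[0.1, 0.2, 0.3], [0.3, 0.2, 0.1]])  # 2 topics x 3 word ids
>>> phinorm = model.compute_phinorm(expElogthetad, expElogbetad)  # one normalizer per word id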
"""
expElogtheta_sum = expElogthetad.sum(axis=0)
phinorm = expElogtheta_sum.dot(expElogbetad) + 1e-100
return phinorm
def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, chunk_doc_idx=None):
"""Give a `chunk` of sparse document vectors, update gamma for each author corresponding to the `chuck`.
Warnings
--------
The whole input chunk of documents is assumed to fit in RAM; chunking of a large corpus must be done earlier
in the pipeline.
Avoids computing the `phi` variational parameter directly using the
optimization presented in `Lee, Seung: "Algorithms for non-negative matrix factorization", NIPS 2001
<https://papers.nips.cc/paper/1861-algorithms-for-non-negative-matrix-factorization.pdf>`_.
Parameters
----------
chunk : iterable of list of (int, float)
Corpus in BoW format.
author2doc : dict of (str, list of int), optional
A dictionary where keys are the names of authors and values are lists of document IDs that the author
contributes to.
doc2author : dict of (int, list of str), optional
A dictionary where the keys are document IDs and the values are lists of author names.
rhot : float
Value of rho for conducting inference on documents.
collect_sstats : boolean, optional
If True, collect the sufficient statistics needed to update the model's topic-word distributions, and return
`(gamma_chunk, sstats)`. Otherwise, return `(gamma_chunk, None)`. `gamma_chunk` is of shape
`len(chunk_authors) x self.num_topics`, where `chunk_authors` is the number of authors in the documents of
the current chunk.
chunk_doc_idx : numpy.ndarray, optional
Indices of the chunk's documents within the full training corpus, used to look up the authors of each document.
Returns
-------
(numpy.ndarray, numpy.ndarray)
`gamma_chunk` and `sstats` (`sstats` is None if `collect_sstats == False`).
"""
try:
len(chunk)
except TypeError:
# convert iterators/generators to plain list, so we have len() etc.
chunk = list(chunk)
if len(chunk) > 1:
logger.debug("performing inference on a chunk of %i documents", len(chunk))
# Initialize the variational distribution q(theta|gamma) for the chunk
if collect_sstats:
sstats = np.zeros_like(self.expElogbeta)
else:
sstats = None
converged = 0
# Stack all the computed gammas into this output array.
gamma_chunk = np.zeros((0, self.num_topics))
# Now, for each document d update gamma and phi w.r.t. all authors in those documents.
for d, doc in enumerate(chunk):
if chunk_doc_idx is not None:
doc_no = chunk_doc_idx[d]
else:
doc_no = d
# Get the IDs and counts of all the words in the current document.
# TODO: this is duplication of code in LdaModel. Refactor.
if doc and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
# make sure the term IDs are ints, otherwise np will get upset
ids = [int(idx) for idx, _ in doc]
else:
ids = [idx for idx, _ in doc]
cts = np.array([cnt for _, cnt in doc])
# Get all authors in current document, and convert the author names to integer IDs.
authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
gammad = self.state.gamma[authors_d, :] # gamma of document d before update.
tilde_gamma = gammad.copy() # gamma that will be updated.
# Compute the expectation of the log of the Dirichlet parameters theta and beta.
Elogthetad = dirichlet_expectation(tilde_gamma)
expElogthetad = np.exp(Elogthetad)
expElogbetad = self.expElogbeta[:, ids]
# Compute the normalizing constant of phi for the current document.
phinorm = self.compute_phinorm(expElogthetad, expElogbetad)
# Iterate between gamma and phi until convergence
for _ in xrange(self.iterations):
lastgamma = tilde_gamma.copy()
# Update gamma.
# phi is computed implicitly below,
for ai, a in enumerate(authors_d):
tilde_gamma[ai, :] = self.alpha + len(self.author2doc[self.id2author[a]])\
* expElogthetad[ai, :] * np.dot(cts / phinorm, expElogbetad.T)
# Update gamma.
# Interpolation between document d's "local" gamma (tilde_gamma),
# and "global" gamma (gammad).
tilde_gamma = (1 - rhot) * gammad + rhot * tilde_gamma
# Update Elogtheta and Elogbeta, since gamma and lambda have been updated.
Elogthetad = dirichlet_expectation(tilde_gamma)
expElogthetad = np.exp(Elogthetad)
# Update the normalizing constant in phi.
phinorm = self.compute_phinorm(expElogthetad, expElogbetad)
# Check for convergence.
# Criterion is mean change in "local" gamma.
meanchange_gamma = np.mean(abs(tilde_gamma - lastgamma))
gamma_condition = meanchange_gamma < self.gamma_threshold
if gamma_condition:
converged += 1
break
# End of iterations loop.
# Store the updated gammas in the model state.
self.state.gamma[authors_d, :] = tilde_gamma
# Stack the new gammas into the output array.
gamma_chunk = np.vstack([gamma_chunk, tilde_gamma])
if collect_sstats:
# Contribution of document d to the expected sufficient
# statistics for the M step.
expElogtheta_sum_a = expElogthetad.sum(axis=0)
sstats[:, ids] += np.outer(expElogtheta_sum_a.T, cts / phinorm)
if len(chunk) > 1:
logger.debug(
"%i/%i documents converged within %i iterations",
converged, len(chunk), self.iterations
)
if collect_sstats:
# This step finishes computing the sufficient statistics for the
# M step, so that
# sstats[k, w] = \sum_d n_{dw} * \sum_a phi_{dwak}
# = \sum_d n_{dw} * exp{Elogtheta_{ak} + Elogbeta_{kw}} / phinorm_{dw}.
sstats *= self.expElogbeta
return gamma_chunk, sstats
def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_idx=None):
"""Performs inference (E-step) on a chunk of documents, and accumulate the collected sufficient statistics.
Parameters
----------
chunk : iterable of list of (int, float)
Corpus in BoW format.
author2doc : dict of (str, list of int), optional
A dictionary where keys are the names of authors and values are lists of document IDs that the author
contributes to.
doc2author : dict of (int, list of str), optional
A dictionary where the keys are document IDs and the values are lists of author names.
rhot : float
Value of rho for conducting inference on documents.
state : :class:`~gensim.models.atmodel.AuthorTopicState`, optional
State to be updated with the newly accumulated sufficient statistics. If None, the model's `self.state` is used.
chunk_doc_idx : numpy.ndarray, optional
Indices of the chunk's documents within the full training corpus, used to look up the authors of each document.
Returns
-------
numpy.ndarray
Gamma matrix (author-topic weights) computed for the authors of the documents in the chunk.
"""
# TODO: this method is somewhat similar to the one in LdaModel. Refactor if possible.
if state is None:
state = self.state
gamma, sstats = self.inference(
chunk, author2doc, doc2author, rhot,
collect_sstats=True, chunk_doc_idx=chunk_doc_idx
)
state.sstats += sstats
state.numdocs += len(chunk)
return gamma
def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None):
"""Calculate per-word likelihood bound, using the `chunk` of documents as evaluation corpus.
Parameters
----------
chunk : iterable of list of (int, float)
Corpus in BoW format.
chunk_doc_idx : numpy.ndarray, optional
Indices of the chunk's documents within `self.corpus`, used to look up the authors of each document.
total_docs : int, optional
Total number of documents the bound is extrapolated to; defaults to the number of documents in `chunk`.
Returns
-------
float
Value of per-word likelihood bound.
"""
# TODO: This method is very similar to the one in LdaModel. Refactor.
if total_docs is None:
total_docs = len(chunk)
corpus_words = sum(cnt for document in chunk for _, cnt in document)
subsample_ratio = 1.0 * total_docs / len(chunk)
perwordbound = self.bound(chunk, chunk_doc_idx, subsample_ratio=subsample_ratio) / \
(subsample_ratio * corpus_words)
logger.info(
"%.3f per-word bound, %.1f perplexity estimate based on a corpus of %i documents with %i words",
perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words
)
return perwordbound
def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None,
passes=None, update_every=None, eval_every=None, iterations=None,
gamma_threshold=None, chunks_as_numpy=False):
"""Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the
maximum number of allowed iterations is reached).
Notes
-----
This update also supports updating an already trained model (self)
with new documents from `corpus`: the two models are then merged in proportion to the number of old vs. new
documents. This feature is still experimental for non-stationary input streams.
For stationary input (no topic drift in new documents), on the other hand, this equals the online update of
`Hoffman et al. Stochastic Variational Inference
<http://www.jmlr.org/papers/volume14/hoffman13a/hoffman13a.pdf>`_ and is guaranteed to converge for any `decay`
in (0.5, 1.0]. Additionally, for smaller `corpus` sizes, an increasing `offset` may be beneficial (see
Table 1 in Hoffman et al.)
If update is called with authors that already exist in the model, it will resume training on not only new
documents for that author, but also the previously seen documents. This is necessary for those authors' topic
distributions to converge.
Every time `update(corpus, author2doc)` is called, the new documents are appended to all the previously seen
documents, and author2doc is combined with the previously seen authors.
To resume training on all the data seen by the model, simply call
:meth:`~gensim.models.atmodel.AuthorTopicModel.update`.
It is not possible to add new authors to existing documents, as all documents in `corpus` are assumed to be
new documents.
Parameters
----------
corpus : iterable of list of (int, float)
The corpus in BoW format.
author2doc : dict of (str, list of int), optional
A dictionary where keys are the names of authors and values are lists of document IDs that the author
contributes to.
doc2author : dict of (int, list of str), optional
A dictionary where the keys are document IDs and the values are lists of author names.
chunksize : int, optional
Controls the size of the mini-batches.
decay : float, optional
Controls how old documents are forgotten.
offset : float, optional
Controls down-weighting of iterations.
passes : int, optional
Number of times the model makes a pass over the entire training data.
update_every : int, optional
Number of chunks to process before each update of the topic distributions (M step);
set to 0 for batch learning (a single update at the end of every pass).
eval_every : int, optional
Estimate and log perplexity every `eval_every` chunks; set to 0 or None to disable.
iterations : int, optional
Maximum number of times the model loops over each document.
gamma_threshold : float, optional
Minimum change in the gamma parameters required to keep iterating over a document.
chunks_as_numpy : bool, optional
Whether each chunk passed to :meth:`~gensim.models.atmodel.AuthorTopicModel.inference` should be a numpy
array or not. Numpy can in some settings turn the term IDs into floats; these will be converted back into
integers in inference, which incurs a performance hit. For distributed computing (not supported yet)
it may be desirable to keep the chunks as numpy arrays.
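Example
-------
A hedged sketch of a follow-up update with previously unseen documents; `new_corpus` and `new_author2doc`
are hypothetical, and the document IDs in `new_author2doc` are indices into `new_corpus` (i.e. they start
from 0 again and are offset internally by the number of documents already seen):
>>> new_corpus = [[(0, 1), (1, 1)], [(2, 1)]]  # toy BoW documents
>>> new_author2doc = {'jill': [0, 1], 'john': [1]}  # 'jill' is new, 'john' was seen before
>>> model.update(new_corpus, new_author2doc)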
"""
# use parameters given in constructor, unless user explicitly overrode them
if decay is None:
decay = self.decay
if offset is None:
offset = self.offset
if passes is None:
passes = self.passes
if update_every is None:
update_every = self.update_every
if eval_every is None:
eval_every = self.eval_every
if iterations is None:
iterations = self.iterations
if gamma_threshold is None:
gamma_threshold = self.gamma_threshold
# TODO: if deepcopy is not used here, something goes wrong. When unit tests are run (specifically "testPasses"),
# the process simply gets killed.
author2doc = deepcopy(author2doc)
doc2author = deepcopy(doc2author)
# TODO: it is not possible to add new authors to an existing document (all input documents are treated
# as completely new documents). Perhaps this functionality could be implemented.
# If it's absolutely necessary, the user can delete the documents that have new authors, and call update
# on them with the new and old authors.
if corpus is None:
# Just keep training on the already available data.
# Assumes self.update() has been called before with input documents and corresponding authors.
assert self.total_docs > 0, 'update() was called with no documents to train on.'
train_corpus_idx = [d for d in xrange(self.total_docs)]
num_input_authors = len(self.author2doc)
else:
if doc2author is None and author2doc is None:
raise ValueError(
'at least one of author2doc/doc2author must be specified, to establish input space dimensionality'
)
# If either doc2author or author2doc is missing, construct them from the other.
if doc2author is None:
doc2author = construct_doc2author(corpus, author2doc)
elif author2doc is None:
author2doc = construct_author2doc(doc2author)
# Number of authors that need to be updated.
num_input_authors = len(author2doc)
try:
len_input_corpus = len(corpus)
except TypeError:
logger.warning("input corpus stream has no len(); counting documents")
len_input_corpus = sum(1 for _ in corpus)
if len_input_corpus == 0:
logger.warning("AuthorTopicModel.update() called with an empty corpus")
return
self.total_docs += len_input_corpus
# Add new documents in corpus to self.corpus.
self.extend_corpus(corpus)
# Obtain a list of new authors.
new_authors = []
# Sorting the author names makes the model more reproducible.
for a in sorted(author2doc.keys()):
if not self.author2doc.get(a):
new_authors.append(a)
num_new_authors = len(new_authors)
# Add new authors to the author2id/id2author dictionaries.
for a_id, a_name in enumerate(new_authors):
self.author2id[a_name] = a_id + self.num_authors
self.id2author[a_id + self.num_authors] = a_name
# Increment the number of total authors seen.
self.num_authors += num_new_authors
# Initialize the variational distributions q(theta|gamma)
gamma_new = self.random_state.gamma(100., 1. / 100., (num_new_authors, self.num_topics))
self.state.gamma = np.vstack([self.state.gamma, gamma_new])
# Combine author2doc with self.author2doc.
# First, increment the document IDs by the number of previously seen documents.
for a, doc_ids in author2doc.items():
author2doc[a] = [d + self.total_docs - len_input_corpus for d in doc_ids]  # write the offset IDs back
# For all authors in the input corpus, add the new documents.
for a, doc_ids in author2doc.items():
if self.author2doc.get(a):
# This is not a new author, append new documents.
self.author2doc[a].extend(doc_ids)
else:
# This is a new author, create index.
self.author2doc[a] = doc_ids
# Add all new documents to self.doc2author.
for d, a_list in doc2author.items():
self.doc2author[d + self.total_docs - len_input_corpus] = a_list  # offset by previously seen documents
# Train on all documents of authors in input_corpus.
train_corpus_idx = []
for _ in author2doc.keys(): # For all authors in input corpus.
for doc_ids in self.author2doc.values(): # For all documents in total corpus.
train_corpus_idx.extend(doc_ids)
# Make the list of training documents unique.
train_corpus_idx = list(set(train_corpus_idx))
# train_corpus_idx is only a list of indexes, so "len" is valid.
lencorpus = len(train_corpus_idx)
if chunksize is None:
chunksize = min(lencorpus, self.chunksize)
self.state.numdocs += lencorpus
if update_every:
updatetype = "online"
updateafter = min(lencorpus, update_every * self.numworkers * chunksize)
else:
updatetype = "batch"
updateafter = lencorpus
evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize)
updates_per_pass = max(1, lencorpus / updateafter)
logger.info(
"running %s author-topic training, %s topics, %s authors, "
"%i passes over the supplied corpus of %i documents, updating model once "
"every %i documents, evaluating perplexity every %i documents, "
"iterating %ix with a convergence threshold of %f",
updatetype, self.num_topics, num_input_authors, passes, lencorpus, updateafter,
evalafter, iterations, gamma_threshold
)
if updates_per_pass * passes < 10:
logger.warning(
"too few updates, training might not converge; "
"consider increasing the number of passes or iterations to improve accuracy"
)
# rho is the "speed" of updating; TODO try other fncs
# pass_ + num_updates handles increasing the starting t for each pass,
# while allowing it to "reset" on the first pass of each update
def rho():
return pow(offset + pass_ + (self.num_updates / chunksize), -decay)
for pass_ in xrange(passes):
if self.dispatcher:
logger.info('initializing %s workers', self.numworkers)
self.dispatcher.reset(self.state)
else:
# gamma is not needed in "other", thus its shape is (0, 0).
other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0))
dirty = False
reallen = 0
for chunk_no, chunk_doc_idx in enumerate(
utils.grouper(train_corpus_idx, chunksize, as_numpy=chunks_as_numpy)):
chunk = [self.corpus[d] for d in chunk_doc_idx]
reallen += len(chunk) # keep track of how many documents we've processed so far
if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
# log_perplexity requires the indexes of the documents being evaluated, to know what authors
# correspond to the documents.
self.log_perplexity(chunk, chunk_doc_idx, total_docs=lencorpus)
if self.dispatcher:
# add the chunk to dispatcher's job queue, so workers can munch on it
logger.info(
"PROGRESS: pass %i, dispatching documents up to #%i/%i",
pass_, chunk_no * chunksize + len(chunk), lencorpus
)
# this will eventually block until some jobs finish, because the queue has a small finite length
self.dispatcher.putjob(chunk)
else:
logger.info(
"PROGRESS: pass %i, at document #%i/%i",
pass_, chunk_no * chunksize + len(chunk), lencorpus
)
# do_estep requires the indexes of the documents being trained on, to know what authors
# correspond to the documents.
gammat = self.do_estep(chunk, self.author2doc, self.doc2author, rho(), other, chunk_doc_idx)
if self.optimize_alpha:
self.update_alpha(gammat, rho())
dirty = True
del chunk
# perform an M step. determine when based on update_every, don't do this after every chunk
if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
if self.dispatcher:
# distributed mode: wait for all workers to finish
logger.info("reached the end of input; now waiting for all remaining jobs to finish")
other = self.dispatcher.getstate()
self.do_mstep(rho(), other, pass_ > 0)
del other # frees up memory
if self.dispatcher:
logger.info('initializing workers')
self.dispatcher.reset(self.state)
else:
other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0))
dirty = False
# endfor single corpus iteration
if reallen != lencorpus:
raise RuntimeError("input corpus size changed during training (don't use generators as input)")
if dirty:
# finish any remaining updates
if self.dispatcher:
# distributed mode: wait for all workers to finish
logger.info("reached the end of input; now waiting for all remaining jobs to finish")
other = self.dispatcher.getstate()
self.do_mstep(rho(), other, pass_ > 0)
del other
def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None):
"""Estimate the variational bound of documents from `corpus`.
:math:`\\mathbb{E}_{q}[\\log p(corpus)] - \\mathbb{E}_{q}[\\log q(corpus)]`
Notes
-----
There are basically two use cases of this method:
#. `chunk` is a subset of the training corpus, and `chunk_doc_idx` is provided,
indicating the indexes of the documents in the training corpus.
#. `chunk` is a test set (held-out data), and `author2doc` and `doc2author` corresponding to this test set
are provided. There must not be any new authors passed to this method, `chunk_doc_idx` is not needed
in this case.
Parameters
----------
chunk : iterable of list of (int, float)
Corpus in BoW format.
chunk_doc_idx : numpy.ndarray, optional
Indices of the chunk's documents within the training corpus, used to look up the authors of each document.
subsample_ratio : float, optional
Ratio of the total corpus size to the size of `chunk`; used to rescale the word score so the bound is
on the scale of the full corpus.
author2doc : dict of (str, list of int), optional
A dictionary where keys are the names of authors and values are lists of document IDs that the author
contributes to.
doc2author : dict of (int, list of str), optional
A dictionary where the keys are document IDs and the values are lists of author names.
Returns
-------
float
Value of variational bound score.
"""
# TODO: enable evaluation of documents with new authors. One could, for example, make it
# possible to pass a list of documents to self.inference with no author dictionaries,
# assuming all the documents correspond to one (unseen) author, learn the author's
# gamma, and return gamma (without adding it to self.state.gamma). Of course,
# collect_sstats should be set to false, so that the model is not updated w.r.t. these
# new documents.
_lambda = self.state.get_lambda()
Elogbeta = dirichlet_expectation(_lambda)
expElogbeta = np.exp(Elogbeta)
gamma = self.state.gamma
if author2doc is None and doc2author is None:
# Evaluating on training documents (chunk of self.corpus).
author2doc = self.author2doc
doc2author = self.doc2author
if not chunk_doc_idx:
# If author2doc and doc2author are not provided, chunk is assumed to be a subset of
# self.corpus, and chunk_doc_idx is thus required.
raise ValueError(
'Either author dictionaries or chunk_doc_idx must be provided. '
'Consult documentation of bound method.'
)
elif author2doc is not None and doc2author is not None:
# Training on held-out documents (documents not seen during training).
# All authors in dictionaries must still be seen during training.
for a in author2doc.keys():
if not self.author2doc.get(a):
raise ValueError('bound cannot be called with authors not seen during training.')
if chunk_doc_idx:
raise ValueError(
'Either author dictionaries or chunk_doc_idx must be provided, not both. '
'Consult documentation of bound method.'
)
else:
raise ValueError(
'Either both author2doc and doc2author should be provided, or neither. '
'Consult documentation of bound method.'
)
Elogtheta = dirichlet_expectation(gamma)
expElogtheta = np.exp(Elogtheta)
word_score = 0.0
theta_score = 0.0
for d, doc in enumerate(chunk):
if chunk_doc_idx:
doc_no = chunk_doc_idx[d]
else:
doc_no = d
# Get all authors in current document, and convert the author names to integer IDs.
authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
ids = np.array([id for id, _ in doc]) # Word IDs in doc.
cts = np.array([cnt for _, cnt in doc]) # Word counts.
if d % self.chunksize == 0:
logger.debug("bound: at document #%i in chunk", d)
# Computing the bound requires summing over expElogtheta[a, k] * expElogbeta[k, v], which
# is the same computation as in normalizing phi.
phinorm = self.compute_phinorm(expElogtheta[authors_d, :], expElogbeta[:, ids])
word_score += np.log(1.0 / len(authors_d)) * sum(cts) + cts.dot(np.log(phinorm))
# Compensate likelihood for when `chunk` above is only a sample of the whole corpus. This ensures
# that the likelihood is always roughly on the same scale.
word_score *= subsample_ratio
# E[log p(theta | alpha) - log q(theta | gamma)]
for a in author2doc.keys():
a = self.author2id[a]
theta_score += np.sum((self.alpha - gamma[a, :]) * Elogtheta[a, :])
theta_score += np.sum(gammaln(gamma[a, :]) - gammaln(self.alpha))
theta_score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gamma[a, :]))
# theta_score is rescaled in a similar fashion.
# TODO: treat this in a more general way, similar to how it is done with word_score.
theta_score *= self.num_authors / len(author2doc)
# E[log p(beta | eta) - log q (beta | lambda)]
beta_score = 0.0
beta_score += np.sum((self.eta - _lambda) * Elogbeta)
beta_score += np.sum(gammaln(_lambda) - gammaln(self.eta))
sum_eta = np.sum(self.eta)
beta_score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))
total_score = word_score + theta_score + beta_score
return total_score
def get_document_topics(self, word_id, minimum_probability=None):
"""Override :meth:`~gensim.models.ldamodel.LdaModel.get_document_topics` and simply raises an exception.
Warnings
--------
This method invalid for model, use :meth:`~gensim.models.atmodel.AuthorTopicModel.get_author_topics` or
:meth:`~gensim.models.atmodel.AuthorTopicModel.get_new_author_topics` instead.
Raises
------
NotImplementedError
Always.
"""
raise NotImplementedError(
'Method "get_document_topics" is not valid for the author-topic model. '
'Use the "get_author_topics" method.'
)
def get_new_author_topics(self, corpus, minimum_probability=None):
"""Infers topics for new author.
Infers a topic distribution for a new author over the passed corpus of docs,
assuming that all documents are from this single new author.
Parameters
----------
corpus : iterable of list of (int, float)
Corpus in BoW format.
minimum_probability : float, optional
Ignore topics with probability below this value; if None, the model's `minimum_probability` is used
(never lower than 1e-8).
Returns
-------
list of (int, float)
Topic distribution for the given `corpus`.
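Example
-------
A hedged sketch, assuming `model` is a trained :class:`~gensim.models.atmodel.AuthorTopicModel` and
`held_out` is a hypothetical BoW corpus written by a single, previously unseen author:
>>> new_author_topics = model.get_new_author_topics(held_out)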
"""
def rho():
return pow(self.offset + 1 + 1, -self.decay)
def rollback_new_author_changes():
self.state.gamma = self.state.gamma[0:-1]
del self.author2doc[new_author_name]
a_id = self.author2id[new_author_name]
del self.id2author[a_id]
del self.author2id[new_author_name]
for new_doc_id in corpus_doc_idx:
del self.doc2author[new_doc_id]
try:
len_input_corpus = len(corpus)
except TypeError:
logger.warning("input corpus stream has no len(); counting documents")
len_input_corpus = sum(1 for _ in corpus)
if len_input_corpus == 0:
raise ValueError("AuthorTopicModel.get_new_author_topics() called with an empty corpus")
new_author_name = "placeholder_name"
# indexes representing the documents in the input corpus
corpus_doc_idx = list(range(self.total_docs, self.total_docs + len_input_corpus))
# Add the new placeholder author to author2id/id2author dictionaries.
num_new_authors = 1
author_id = self.num_authors
if new_author_name in self.author2id:
raise ValueError("self.author2id already has 'placeholder_name' author")
self.author2id[new_author_name] = author_id
self.id2author[author_id] = new_author_name
# Add new author in author2doc and doc into doc2author.
self.author2doc[new_author_name] = corpus_doc_idx
for new_doc_id in corpus_doc_idx:
self.doc2author[new_doc_id] = [new_author_name]
gamma_new = self.random_state.gamma(100., 1. / 100., (num_new_authors, self.num_topics))
self.state.gamma = np.vstack([self.state.gamma, gamma_new])
# Do not record the sstats, as we are going to delete the new author once the inference is done.
try:
gammat, _ = self.inference(
corpus, self.author2doc, self.doc2author, rho(),
collect_sstats=False, chunk_doc_idx=corpus_doc_idx
)
new_author_topics = self.get_author_topics(new_author_name, minimum_probability)
finally:
rollback_new_author_changes()
return new_author_topics
def get_author_topics(self, author_name, minimum_probability=None):
"""Get topic distribution the given author.
Parameters
----------
author_name : str
Name of the author for which the topic distribution needs to be estimated.
minimum_probability : float, optional
Sets the minimum probability value for showing the topics of a given author, topics with probability <
`minimum_probability` will be ignored.
Returns
-------
list of (int, float)
Topic distribution of an author.
Example
-------
>>> from gensim.models import AuthorTopicModel
>>> from gensim.corpora import mmcorpus
>>> from gensim.test.utils import common_dictionary, datapath, temporary_file
>>> author2doc = {
... 'john': [0, 1, 2, 3, 4, 5, 6],
... 'jane': [2, 3, 4, 5, 6, 7, 8],
... 'jack': [0, 2, 4, 6, 8]
... }
>>>
>>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
>>>
>>> with temporary_file("serialized") as s_path:
... model = AuthorTopicModel(
... corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4,
... serialized=True, serialization_path=s_path
... )
...
... model.update(corpus, author2doc) # update the author-topic model with additional documents
>>>
>>> # construct vectors for authors
>>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
"""
author_id = self.author2id[author_name]
if minimum_probability is None:
minimum_probability = self.minimum_probability
minimum_probability = max(minimum_probability, 1e-8) # never allow zero values in sparse output
topic_dist = self.state.gamma[author_id, :] / sum(self.state.gamma[author_id, :])
author_topics = [
(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
if topicvalue >= minimum_probability
]
return author_topics
def __getitem__(self, author_names, eps=None):
"""Get topic distribution for input `author_names`.
Parameters
----------
author_names : {str, list of str}
Name(s) of the author for which the topic distribution needs to be estimated.
eps : float, optional
The minimum probability value for showing the topics of a given author, topics with probability < `eps`
will be ignored.
Returns
-------
list of (int, float) **or** list of list of (int, float)
Topic distribution for the author(s), type depends on type of `author_names`.
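Example
-------
A short sketch, assuming `model` is a trained :class:`~gensim.models.atmodel.AuthorTopicModel` with
authors 'jane' and 'jack' as in the module-level example:
>>> jane_topics = model['jane']  # topic distribution for a single author
>>> both = model[['jane', 'jack']]  # list with one distribution per author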
"""
if isinstance(author_names, list):
items = []
for a in author_names:
items.append(self.get_author_topics(a, minimum_probability=eps))
else:
items = self.get_author_topics(author_names, minimum_probability=eps)
return items