#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2016 Radim Rehurek <radimrehurek@seznam.cz>
# Copyright (C) 2016 Olavur Mortensen <olavurmortensen@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Author-topic model.
This module trains the author-topic model on documents and corresponding author-document dictionaries.
The training is online and is constant in memory w.r.t. the number of documents.
The model is *not* constant in memory w.r.t. the number of authors.
The model can be updated with additional documents after training has been completed. It is
also possible to continue training on the existing data.
The model is closely related to :class:`~gensim.models.ldamodel.LdaModel`.
The :class:`~gensim.models.atmodel.AuthorTopicModel` class inherits :class:`~gensim.models.ldamodel.LdaModel`,
and its usage is thus similar.
The model was introduced by `Rosen-Zvi and co-authors: "The Author-Topic Model for Authors and Documents"
<https://arxiv.org/abs/1207.4169>`_. The model correlates authorship information with the topics to give better
insight into an author's subject knowledge.
Example
-------
>>> from gensim.models import AuthorTopicModel
>>> from gensim.corpora import mmcorpus
>>> from gensim.test.utils import common_dictionary, datapath, temporary_file
>>> author2doc = {
... 'john': [0, 1, 2, 3, 4, 5, 6],
... 'jane': [2, 3, 4, 5, 6, 7, 8],
... 'jack': [0, 2, 4, 6, 8]
... }
>>>
>>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
>>>
>>> with temporary_file("serialized") as s_path:
... model = AuthorTopicModel(
... corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4,
... serialized=True, serialization_path=s_path
... )
...
... model.update(corpus, author2doc) # update the author-topic model with additional documents
>>>
>>> # construct vectors for authors
>>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
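The trained model can then be queried for single authors, or for authors not seen during training.
A short, illustrative sketch reusing the `model` and `corpus` objects from above:
>>> jane_topics = model.get_author_topics('jane')  # topic distribution of a known author
>>>
>>> # treat the whole corpus as the work of one unseen author and infer that author's topics
>>> new_author_topics = model.get_new_author_topics(corpus)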
"""
# TODO: this class inherits LdaModel and overwrites some methods. There is some code
# duplication still, and a refactor could be made to avoid this. Comments with "TODOs"
# are included in the code where this is the case, for example in the log_perplexity
# and do_estep methods.
import logging
import numpy as np # for arrays, array broadcasting etc.
from copy import deepcopy
from shutil import copyfile
from os.path import isfile
from os import remove
from gensim import utils
from gensim.models import LdaModel
from gensim.models.ldamodel import LdaState
from gensim.matutils import dirichlet_expectation
from gensim.corpora import MmCorpus
from itertools import chain
from scipy.special import gammaln # gamma function utils
from six.moves import xrange
import six
logger = logging.getLogger(__name__)
class AuthorTopicState(LdaState):
"""Encapsulate information for computation of :class:`~gensim.models.atmodel.AuthorTopicModel`."""
def __init__(self, eta, lambda_shape, gamma_shape):
"""
Parameters
----------
eta: numpy.ndarray
Dirichlet prior on the per-topic word distribution (the topic-word prior).
lambda_shape: (int, int)
Shape of the topic-word sufficient statistics matrix, i.e. (num_topics, num_terms).
gamma_shape: (int, int)
Shape of the variational author-topic matrix gamma, i.e. (num_authors, num_topics).
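Example
-------
A minimal, illustrative sketch of constructing a state directly (normally this is done internally by
:class:`~gensim.models.atmodel.AuthorTopicModel`); the shapes below are arbitrary example values:
>>> import numpy as np
>>> from gensim.models.atmodel import AuthorTopicState
>>> eta = np.full(10, 0.01)  # topic-word prior for a 10-term vocabulary (toy value)
>>> state = AuthorTopicState(eta, (5, 10), (3, 5))  # 5 topics, 10 terms, 3 authors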
"""
self.eta = eta
self.sstats = np.zeros(lambda_shape)
self.gamma = np.zeros(gamma_shape)
self.numdocs = 0
self.dtype = np.float64 # To be compatible with LdaState
def construct_doc2author(corpus, author2doc):
"""Create a mapping from document IDs to author IDs.
Parameters
----------
corpus: iterable of list of (int, float)
Corpus in BoW format.
author2doc: dict of (str, list of int)
Mapping of authors to documents.
Returns
-------
dict of (int, list of str)
Document to Author mapping.
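Example
-------
A small, illustrative example; the word counts in `corpus` are irrelevant here, only the number of
documents matters:
>>> from gensim.models.atmodel import construct_doc2author
>>> corpus = [[(0, 2), (1, 1)], [(1, 3)]]  # two documents in BoW format (toy values)
>>> author2doc = {'john': [0], 'jane': [0, 1]}
>>> doc2author = construct_doc2author(corpus, author2doc)  # -> {0: ['john', 'jane'], 1: ['jane']}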
"""
doc2author = {}
for d, _ in enumerate(corpus):
author_ids = []
for a, a_doc_ids in author2doc.items():
if d in a_doc_ids:
author_ids.append(a)
doc2author[d] = author_ids
return doc2author
def construct_author2doc(doc2author):
"""Make a mapping from author IDs to document IDs.
Parameters
----------
doc2author: dict of (int, list of str)
Mapping of document id to authors.
Returns
-------
dict of (str, list of int)
Mapping of authors to document ids.
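Example
-------
A small, illustrative example; note that the order of the resulting keys depends on set iteration and
is not guaranteed:
>>> from gensim.models.atmodel import construct_author2doc
>>> doc2author = {0: ['john', 'jane'], 1: ['jane']}
>>> author2doc = construct_author2doc(doc2author)  # -> {'john': [0], 'jane': [0, 1]}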
"""
# First get a set of all authors.
authors_ids = set()
for d, a_doc_ids in doc2author.items():
for a in a_doc_ids:
authors_ids.add(a)
# Now construct the dictionary.
author2doc = {}
for a in authors_ids:
author2doc[a] = []
for d, a_ids in doc2author.items():
if a in a_ids:
author2doc[a].append(d)
return author2doc
class AuthorTopicModel(LdaModel):
"""The constructor estimates the author-topic model parameters based on a training corpus."""
def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None,
chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
gamma_threshold=0.001, serialized=False, serialization_path=None,
minimum_probability=0.01, random_state=None):
"""
Parameters
----------
corpus : iterable of list of (int, float), optional
Corpus in BoW format
num_topics : int, optional
Number of topics to be extracted from the training corpus.
id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
A mapping from word ids (integers) to words (strings).
author2doc : dict of (str, list of int), optional
A dictionary where keys are the names of authors and values are lists of document IDs that the author
contributes to.
doc2author : dict of (int, list of str), optional
A dictionary where the keys are document IDs and the values are lists of author names.
chunksize : int, optional
Controls the size of the mini-batches.
passes : int, optional
Number of times the model makes a pass over the entire training data.
iterations : int, optional
Maximum number of times the model loops over each document.
decay : float, optional
Exponential decay factor in (0.5, 1] controlling how quickly the contribution of old documents is
forgotten when new documents are examined.
offset : float, optional
Down-weights early iterations, slowing down the first few updates.
alpha : {float, numpy.ndarray, str}, optional
A-priori belief on the author-topic distribution. Supports the special values 'symmetric' (the default),
'asymmetric' (a fixed normalized asymmetric 1.0 / topicno prior) and 'auto' (learns an asymmetric prior
directly from your data).
eta : {float, numpy.ndarray, str}, optional
A-priori belief on the topic-word distribution. The special values 'symmetric' (the default) and 'auto'
are supported; 'asymmetric' is not allowed for eta.
update_every : int, optional
Number of chunks to process before each update of the topic distributions (M step);
set to 0 for batch learning (a single update at the end of every pass).
eval_every : int, optional
Estimate and log perplexity every `eval_every` chunks; set to 0 or None to disable.
gamma_threshold : float, optional
Minimum change in the gamma parameters required to keep iterating over a document.
serialized : bool, optional
Indicates whether the input corpora to the model are simple lists
or saved to the hard-drive.
serialization_path : str, optional
Must be set to a filepath, if `serialized = True` is used.
minimum_probability : float, optional
Topics with a probability lower than this threshold will be filtered out of the returned
author-topic distributions.
random_state : {int, numpy.random.RandomState}, optional
Set the state of the random number generator inside the author-topic model.
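Example
-------
A minimal in-memory sketch (no serialization), assuming `corpus`, `author2doc` and `common_dictionary`
as in the module-level example:
>>> model = AuthorTopicModel(corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4)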
"""
# NOTE: this doesn't call constructor of a base class, but duplicates most of this code
# so we have to set dtype to float64 default here
self.dtype = np.float64
# NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the
# infrastructure to implement a distributed author-topic model is already in place,
# such as the AuthorTopicState.
distributed = False
self.dispatcher = None
self.numworkers = 1
self.id2word = id2word
if corpus is None and self.id2word is None:
raise ValueError(
"at least one of corpus/id2word must be specified, to establish input space dimensionality"
)
if self.id2word is None:
logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
self.id2word = utils.dict_from_corpus(corpus)
self.num_terms = len(self.id2word)
elif len(self.id2word) > 0:
self.num_terms = 1 + max(self.id2word.keys())
else:
self.num_terms = 0
if self.num_terms == 0:
raise ValueError("cannot compute the author-topic model over an empty collection (no terms)")
logger.info('Vocabulary consists of %d words.', self.num_terms)
self.author2doc = {}
self.doc2author = {}
self.distributed = distributed
self.num_topics = num_topics
self.num_authors = 0
self.chunksize = chunksize
self.decay = decay
self.offset = offset
self.minimum_probability = minimum_probability
self.num_updates = 0
self.total_docs = 0
self.passes = passes
self.update_every = update_every
self.eval_every = eval_every
self.author2id = {}
self.id2author = {}
self.serialized = serialized
if serialized and not serialization_path:
raise ValueError(
"If serialized corpora are used, a the path to a folder "
"where the corpus should be saved must be provided (serialized_path)."
)
if serialized and serialization_path:
assert not isfile(serialization_path), \
"A file already exists at the serialization_path path; " \
"choose a different serialization_path, or delete the file."
self.serialization_path = serialization_path
# Initialize an empty self.corpus.
self.init_empty_corpus()
self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
assert self.alpha.shape == (self.num_topics,), \
"Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
if isinstance(eta, six.string_types):
if eta == 'asymmetric':
raise ValueError("The 'asymmetric' option cannot be used for eta")
self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')
self.random_state = utils.get_random_state(random_state)
assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
(str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)
)
# VB constants
self.iterations = iterations
self.gamma_threshold = gamma_threshold
# Initialize the variational distributions q(beta|lambda) and q(theta|gamma)
self.state = AuthorTopicState(self.eta, (self.num_topics, self.num_terms), (self.num_authors, self.num_topics))
self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))
# if a training corpus was provided, start estimating the model right away
if corpus is not None and (author2doc is not None or doc2author is not None):
use_numpy = self.dispatcher is not None
self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy)
def __str__(self):
"""Get a string representation of object.
Returns
-------
str
String representation of current instance.
"""
return "AuthorTopicModel(num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s)" % \
(self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize)
def init_empty_corpus(self):
"""Initialize an empty corpus.
If the corpora are to be treated as lists, simply initialize an empty list.
If serialization is used, initialize an empty corpus using :class:`~gensim.corpora.mmcorpus.MmCorpus`.
"""
if self.serialized:
# Initialize the corpus as a serialized empty list.
# This corpus will be extended in self.update.
MmCorpus.serialize(self.serialization_path, []) # Serialize empty corpus.
self.corpus = MmCorpus(self.serialization_path) # Store serialized corpus object in self.corpus.
else:
# All input corpora are assumed to just be lists.
self.corpus = []
def extend_corpus(self, corpus):
"""Add new documents from `corpus` to `self.corpus`.
If serialization is used, then the entire corpus (`self.corpus`) is re-serialized and the new documents
are added in the process. If serialization is not used, the corpus, as a list of documents, is simply extended.
Parameters
----------
corpus : iterable of list of (int, float)
Corpus in BoW format
Raises
------
AssertionError
If serialized == False and corpus isn't list.
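Example
-------
A minimal sketch for the non-serialized case, where `self.corpus` is a plain list; `model` is assumed to
be an already constructed :class:`~gensim.models.atmodel.AuthorTopicModel` with `serialized=False`
(normally this method is invoked internally by :meth:`~gensim.models.atmodel.AuthorTopicModel.update`):
>>> more_docs = [[(0, 1), (3, 2)], [(1, 1)]]  # two new documents in BoW format (toy values)
>>> model.extend_corpus(more_docs)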
"""
if self.serialized:
# Re-serialize the entire corpus while appending the new documents.
if isinstance(corpus, MmCorpus):
# Check that we are not attempting to overwrite the serialized corpus.
assert self.corpus.input != corpus.input, \
'Input corpus cannot have the same file path as the model corpus (serialization_path).'
corpus_chain = chain(self.corpus, corpus) # A generator with the old and new documents.
# Make a temporary copy of the file where the corpus is serialized.
copyfile(self.serialization_path, self.serialization_path + '.tmp')
self.corpus.input = self.serialization_path + '.tmp' # Point the old corpus at this temporary file.
# Re-serialize the old corpus, and extend it with the new corpus.
MmCorpus.serialize(self.serialization_path, corpus_chain)
self.corpus = MmCorpus(self.serialization_path) # Store the new serialized corpus object in self.corpus.
remove(self.serialization_path + '.tmp') # Remove the temporary file again.
else:
# self.corpus and corpus are just lists, just extend the list.
# First check that corpus is actually a list.
assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists."
self.corpus.extend(corpus)
def compute_phinorm(self, expElogthetad, expElogbetad):
"""Efficiently computes the normalizing factor in phi.
Parameters
----------
expElogthetad: numpy.ndarray
Value of variational distribution :math:`q(\\theta|\\gamma)`.
expElogbetad: numpy.ndarray
Value of variational distribution :math:`q(\\beta|\\lambda)`.
Returns
-------
numpy.ndarray
Normalizing factor of phi, one value per word id in the document.
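Example
-------
A small numeric sketch with two authors, two topics and three word ids (toy values); `model` is assumed
to be any :class:`~gensim.models.atmodel.AuthorTopicModel` instance, since this method does not depend
on the model state:
>>> import numpy as np
>>> expElogthetad = np.array([[0.5, 0.5], [0.2, 0.8]])  # 2 authors x 2 topics
>>> expElogbetad = np.array([[0.1, 0.2, 0.3], [0.3, 0.2, 0.1]])  # 2 topics x 3 word ids
>>> phinorm = model.compute_phinorm(expElogthetad, expElogbetad)  # one normalizer per word id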
"""
expElogtheta_sum = expElogthetad.sum(axis=0)
phinorm = expElogtheta_sum.dot(expElogbetad) + 1e-100
return phinorm
def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, chunk_doc_idx=None):
"""Give a `chunk` of sparse document vectors, update gamma for each author corresponding to the `chuck`.
Warnings
--------
The whole input chunk of documents is assumed to fit in RAM; chunking of a large corpus must be done earlier
in the pipeline.
Avoids computing the `phi` variational parameter directly using the
optimization presented in `Lee, Seung: "Algorithms for non-negative matrix factorization", NIPS 2001
<https://papers.nips.cc/paper/1861-algorithms-for-non-negative-matrix-factorization.pdf>`_.
Parameters
----------
chunk : iterable of list of (int, float)
Corpus in BoW format.
author2doc : dict of (str, list of int), optional
A dictionary where keys are the names of authors and values are lists of document IDs that the author
contributes to.
doc2author : dict of (int, list of str), optional
A dictionary where the keys are document IDs and the values are lists of author names.
rhot : float
Value of rho for conducting inference on documents.
collect_sstats : boolean, optional
If True, collect the sufficient statistics needed to update the model's topic-word distributions, and return
`(gamma_chunk, sstats)`. Otherwise, return `(gamma_chunk, None)`. `gamma_chunk` is of shape
`len(chunk_authors) x self.num_topics`, where `chunk_authors` is the number of authors in the documents of
the current chunk.
chunk_doc_idx : numpy.ndarray, optional
Indices of the chunk's documents within the full training corpus, used to look up the authors of each document.
Returns
-------
(numpy.ndarray, numpy.ndarray)
`gamma_chunk` and `sstats` (`sstats` is None if `collect_sstats == False`).
"""
try:
len(chunk)
except TypeError:
# convert iterators/generators to plain list, so we have len() etc.
chunk = list(chunk)
if len(chunk) > 1:
logger.debug("performing inference on a chunk of %i documents", len(chunk))
# Initialize the variational distribution q(theta|gamma) for the chunk
if collect_sstats:
sstats = np.zeros_like(self.expElogbeta)
else:
sstats = None
converged = 0
# Stack all the computed gammas into this output array.
gamma_chunk = np.zeros((0, self.num_topics))
# Now, for each document d update gamma and phi w.r.t. all authors in those documents.
for d, doc in enumerate(chunk):
if chunk_doc_idx is not None:
doc_no = chunk_doc_idx[d]
else:
doc_no = d
# Get the IDs and counts of all the words in the current document.
# TODO: this is duplication of code in LdaModel. Refactor.
if doc and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
# make sure the term IDs are ints, otherwise np will get upset
ids = [int(idx) for idx, _ in doc]
else:
ids = [idx for idx, _ in doc]
cts = np.array([cnt for _, cnt in doc])
# Get all authors in current document, and convert the author names to integer IDs.
authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
gammad = self.state.gamma[authors_d, :] # gamma of document d before update.
tilde_gamma = gammad.copy() # gamma that will be updated.
# Compute the expectation of the log of the Dirichlet parameters theta and beta.
Elogthetad = dirichlet_expectation(tilde_gamma)
expElogthetad = np.exp(Elogthetad)
expElogbetad = self.expElogbeta[:, ids]
# Compute the normalizing constant of phi for the current document.
phinorm = self.compute_phinorm(expElogthetad, expElogbetad)
# Iterate between gamma and phi until convergence
for _ in xrange(self.iterations):
lastgamma = tilde_gamma.copy()
# Update gamma.
# phi is computed implicitly below,
for ai, a in enumerate(authors_d):
tilde_gamma[ai, :] = self.alpha + len(self.author2doc[self.id2author[a]])\
* expElogthetad[ai, :] * np.dot(cts / phinorm, expElogbetad.T)
# Update gamma.
# Interpolation between document d's "local" gamma (tilde_gamma),
# and "global" gamma (gammad).
tilde_gamma = (1 - rhot) * gammad + rhot * tilde_gamma
# Update Elogtheta and Elogbeta, since gamma and lambda have been updated.
Elogthetad = dirichlet_expectation(tilde_gamma)
expElogthetad = np.exp(Elogthetad)
# Update the normalizing constant in phi.
phinorm = self.compute_phinorm(expElogthetad, expElogbetad)
# Check for convergence.
# Criterion is mean change in "local" gamma.
meanchange_gamma = np.mean(abs(tilde_gamma - lastgamma))
gamma_condition = meanchange_gamma < self.gamma_threshold
if gamma_condition:
converged += 1
break
# End of iterations loop.
# Store the updated gammas in the model state.
self.state.gamma[authors_d, :] = tilde_gamma
# Stack the new gammas into the output array.
gamma_chunk = np.vstack([gamma_chunk, tilde_gamma])
if collect_sstats:
# Contribution of document d to the expected sufficient
# statistics for the M step.
expElogtheta_sum_a = expElogthetad.sum(axis=0)
sstats[:, ids] += np.outer(expElogtheta_sum_a.T, cts / phinorm)
if len(chunk) > 1:
logger.debug(
"%i/%i documents converged within %i iterations",
converged, len(chunk), self.iterations
)
if collect_sstats:
# This step finishes computing the sufficient statistics for the
# M step, so that
# sstats[k, w] = \sum_d n_{dw} * \sum_a phi_{dwak}
# = \sum_d n_{dw} * exp{Elogtheta_{ak} + Elogbeta_{kw}} / phinorm_{dw}.
sstats *= self.expElogbeta
return gamma_chunk, sstats
def do_estep(self, chunk, author2doc, doc2author, rhot, state=None, chunk_doc_idx=None):
"""Performs inference (E-step) on a chunk of documents, and accumulate the collected sufficient statistics.
Parameters
----------
chunk : iterable of list of (int, float)
Corpus in BoW format.
author2doc : dict of (str, list of int), optional
A dictionary where keys are the names of authors and values are lists of document IDs that the author
contributes to.
doc2author : dict of (int, list of str), optional
A dictionary where the keys are document IDs and the values are lists of author names.
rhot : float
Value of rho for conducting inference on documents.
state : :class:`~gensim.models.atmodel.AuthorTopicState`, optional
State to be updated with the newly accumulated sufficient statistics. If None, the model's `self.state` is used.
chunk_doc_idx : numpy.ndarray, optional
Indices of the chunk's documents within the full training corpus, used to look up the authors of each document.
Returns
-------
numpy.ndarray
Gamma matrix (author-topic weights) computed for the authors of the documents in the chunk.
"""
# TODO: this method is somewhat similar to the one in LdaModel. Refactor if possible.
if state is None:
state = self.state
gamma, sstats = self.inference(
chunk, author2doc, doc2author, rhot,
collect_sstats=True, chunk_doc_idx=chunk_doc_idx
)
state.sstats += sstats
state.numdocs += len(chunk)
return gamma
def log_perplexity(self, chunk, chunk_doc_idx=None, total_docs=None):
"""Calculate per-word likelihood bound, using the `chunk` of documents as evaluation corpus.
Parameters
----------
chunk : iterable of list of (int, float)
Corpus in BoW format.
chunk_doc_idx : numpy.ndarray, optional
Indices of the chunk's documents within `self.corpus`, used to look up the authors of each document.
total_docs : int, optional
Total number of documents the bound is extrapolated to; defaults to the number of documents in `chunk`.
Returns
-------
float
Value of per-word likelihood bound.
"""
# TODO: This method is very similar to the one in LdaModel. Refactor.
if total_docs is None:
total_docs = len(chunk)
corpus_words = sum(cnt for document in chunk for _, cnt in document)
subsample_ratio = 1.0 * total_docs / len(chunk)
perwordbound = self.bound(chunk, chunk_doc_idx, subsample_ratio=subsample_ratio) / \
(subsample_ratio * corpus_words)
logger.info(
"%.3f per-word bound, %.1f perplexity estimate based on a corpus of %i documents with %i words",
perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words
)
return perwordbound
def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None,
passes=None, update_every=None, eval_every=None, iterations=None,
gamma_threshold=None, chunks_as_numpy=False):
"""Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the
maximum number of allowed iterations is reached).
Notes
-----
This update also supports updating an already trained model (self)
with new documents from `corpus`: the two models are then merged in proportion to the number of old vs. new
documents. This feature is still experimental for non-stationary input streams.
For stationary input (no topic drift in new documents), on the other hand, this equals the online update of
`Hoffman et al. Stochastic Variational Inference
<http://www.jmlr.org/papers/volume14/hoffman13a/hoffman13a.pdf>`_ and is guaranteed to converge for any `decay`
in (0.5, 1.0]. Additionally, for smaller `corpus` sizes, an increasing `offset` may be beneficial (see
Table 1 in Hoffman et al.)
If update is called with authors that already exist in the model, it will resume training on not only new
documents for that author, but also the previously seen documents. This is necessary for those authors' topic
distributions to converge.
Every time `update(corpus, author2doc)` is called, the new documents are appended to all the previously seen
documents, and author2doc is combined with the previously seen authors.
To resume training on all the data seen by the model, simply call
:meth:`~gensim.models.atmodel.AuthorTopicModel.update`.
It is not possible to add new authors to existing documents, as all documents in `corpus` are assumed to be
new documents.
Parameters
----------
corpus : iterable of list of (int, float)
The corpus in BoW format.
author2doc : dict of (str, list of int), optional
A dictionary where keys are the names of authors and values are lists of document IDs that the author
contributes to.
doc2author : dict of (int, list of str), optional
A dictionary where the keys are document IDs and the values are lists of author names.
chunksize : int, optional
Controls the size of the mini-batches.
decay : float, optional
Controls how old documents are forgotten.
offset : float, optional
Controls down-weighting of iterations.
passes : int, optional
Number of times the model makes a pass over the entire training data.
update_every : int, optional
Number of chunks to process before each update of the topic distributions (M step);
set to 0 for batch learning (a single update at the end of every pass).
eval_every : int, optional
Estimate and log perplexity every `eval_every` chunks; set to 0 or None to disable.
iterations : int, optional
Maximum number of times the model loops over each document.
gamma_threshold : float, optional
Minimum change in the gamma parameters required to keep iterating over a document.
chunks_as_numpy : bool, optional
Whether each chunk passed to :meth:`~gensim.models.atmodel.AuthorTopicModel.inference` should be a numpy
array or not. Numpy can in some settings turn the term IDs into floats; these will be converted back into
integers in inference, which incurs a performance hit. For distributed computing (not supported yet)
it may be desirable to keep the chunks as numpy arrays.
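Example
-------
A hedged sketch of a follow-up update with previously unseen documents; `new_corpus` and `new_author2doc`
are hypothetical, and the document IDs in `new_author2doc` are indices into `new_corpus` (i.e. they start
from 0 again and are offset internally by the number of documents already seen):
>>> new_corpus = [[(0, 1), (1, 1)], [(2, 1)]]  # toy BoW documents
>>> new_author2doc = {'jill': [0, 1], 'john': [1]}  # 'jill' is new, 'john' was seen before
>>> model.update(new_corpus, new_author2doc)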
"""
# use parameters given in constructor, unless user explicitly overrode them
if decay is None:
decay = self.decay
if offset is None:
offset = self.offset
if passes is None:
passes = self.passes
if update_every is None:
update_every = self.update_every
if eval_every is None:
eval_every = self.eval_every
if iterations is None:
iterations = self.iterations
if gamma_threshold is None:
gamma_threshold = self.gamma_threshold
# TODO: if deepcopy is not used here, something goes wrong. When unit tests are run (specifically "testPasses"),
# the process simply gets killed.
author2doc = deepcopy(author2doc)
doc2author = deepcopy(doc2author)
# TODO: it is not possible to add new authors to an existing document (all input documents are treated
# as completely new documents). Perhaps this functionality could be implemented.
# If it's absolutely necessary, the user can delete the documents that have new authors, and call update
# on them with the new and old authors.
if corpus is None:
# Just keep training on the already available data.
# Assumes self.update() has been called before with input documents and corresponding authors.
assert self.total_docs > 0, 'update() was called with no documents to train on.'
train_corpus_idx = [d for d in xrange(self.total_docs)]
num_input_authors = len(self.author2doc)
else:
if doc2author is None and author2doc is None:
raise ValueError(
'at least one of author2doc/doc2author must be specified, to establish input space dimensionality'
)
# If either doc2author or author2doc is missing, construct them from the other.
if doc2author is None:
doc2author = construct_doc2author(corpus, author2doc)
elif author2doc is None:
author2doc = construct_author2doc(doc2author)
# Number of authors that need to be updated.
num_input_authors = len(author2doc)
try:
len_input_corpus = len(corpus)
except TypeError:
logger.warning("input corpus stream has no len(); counting documents")
len_input_corpus = sum(1 for _ in corpus)
if len_input_corpus == 0:
logger.warning("AuthorTopicModel.update() called with an empty corpus")
return
self.total_docs += len_input_corpus
# Add new documents in corpus to self.corpus.
self.extend_corpus(corpus)
# Obtain a list of new authors.
new_authors = []
# Sorting the author names makes the model more reproducible.
for a in sorted(author2doc.keys()):
if not self.author2doc.get(a):
new_authors.append(a)
num_new_authors = len(new_authors)
# Add new authors to the author2id/id2author dictionaries.
for a_id, a_name in enumerate(new_authors):
self.author2id[a_name] = a_id + self.num_authors
self.id2author[a_id + self.num_authors] = a_name
# Increment the number of total authors seen.
self.num_authors += num_new_authors
# Initialize the variational distributions q(theta|gamma)
gamma_new = self.random_state.gamma(100., 1. / 100., (num_new_authors, self.num_topics))
self.state.gamma = np.vstack([self.state.gamma, gamma_new])
# Combine author2doc with self.author2doc.
# First, increment the document IDs by the number of previously seen documents.
for a, doc_ids in author2doc.items():
author2doc[a] = [d + self.total_docs - len_input_corpus for d in doc_ids]  # write the offset IDs back
# For all authors in the input corpus, add the new documents.
for a, doc_ids in author2doc.items():
if self.author2doc.get(a):
# This is not a new author, append new documents.
self.author2doc[a].extend(doc_ids)
else:
# This is a new author, create index.
self.author2doc[a] = doc_ids
# Add all new documents to self.doc2author.
for d, a_list in doc2author.items():
self.doc2author[d + self.total_docs - len_input_corpus] = a_list  # offset by previously seen documents
# Train on all documents of authors in input_corpus.
train_corpus_idx = []
for _ in author2doc.keys(): # For all authors in input corpus.
for doc_ids in self.author2doc.values(): # For all documents in total corpus.
train_corpus_idx.extend(doc_ids)
# Make the list of training documents unique.
train_corpus_idx = list(set(train_corpus_idx))
# train_corpus_idx is only a list of indexes, so "len" is valid.
lencorpus = len(train_corpus_idx)
if chunksize is None:
chunksize = min(lencorpus, self.chunksize)
self.state.numdocs += lencorpus
if update_every:
updatetype = "online"
updateafter = min(lencorpus, update_every * self.numworkers * chunksize)
else:
updatetype = "batch"
updateafter = lencorpus
evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize)
updates_per_pass = max(1, lencorpus / updateafter)
logger.info(
"running %s author-topic training, %s topics, %s authors, "
"%i passes over the supplied corpus of %i documents, updating model once "
"every %i documents, evaluating perplexity every %i documents, "
"iterating %ix with a convergence threshold of %f",
updatetype, self.num_topics, num_input_authors, passes, lencorpus, updateafter,
evalafter, iterations, gamma_threshold
)
if updates_per_pass * passes < 10:
logger.warning(
"too few updates, training might not converge; "
"consider increasing the number of passes or iterations to improve accuracy"
)
# rho is the "speed" of updating; TODO try other fncs
# pass_ + num_updates handles increasing the starting t for each pass,
# while allowing it to "reset" on the first pass of each update
def rho():
return pow(offset + pass_ + (self.num_updates / chunksize), -decay)
for pass_ in xrange(passes):
if self.dispatcher:
logger.info('initializing %s workers', self.numworkers)
self.dispatcher.reset(self.state)
else:
# gamma is not needed in "other", thus its shape is (0, 0).
other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0))
dirty = False
reallen = 0
for chunk_no, chunk_doc_idx in enumerate(
utils.grouper(train_corpus_idx, chunksize, as_numpy=chunks_as_numpy)):
chunk = [self.corpus[d] for d in chunk_doc_idx]
reallen += len(chunk) # keep track of how many documents we've processed so far
if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
# log_perplexity requires the indexes of the documents being evaluated, to know what authors
# correspond to the documents.
self.log_perplexity(chunk, chunk_doc_idx, total_docs=lencorpus)
if self.dispatcher:
# add the chunk to dispatcher's job queue, so workers can munch on it
logger.info(
"PROGRESS: pass %i, dispatching documents up to #%i/%i",
pass_, chunk_no * chunksize + len(chunk), lencorpus
)
# this will eventually block until some jobs finish, because the queue has a small finite length
self.dispatcher.putjob(chunk)
else:
logger.info(
"PROGRESS: pass %i, at document #%i/%i",
pass_, chunk_no * chunksize + len(chunk), lencorpus
)
# do_estep requires the indexes of the documents being trained on, to know what authors
# correspond to the documents.
gammat = self.do_estep(chunk, self.author2doc, self.doc2author, rho(), other, chunk_doc_idx)
if self.optimize_alpha:
self.update_alpha(gammat, rho())
dirty = True
del chunk
# perform an M step. determine when based on update_every, don't do this after every chunk
if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
if self.dispatcher:
# distributed mode: wait for all workers to finish
logger.info("reached the end of input; now waiting for all remaining jobs to finish")
other = self.dispatcher.getstate()
self.do_mstep(rho(), other, pass_ > 0)
del other # frees up memory
if self.dispatcher:
logger.info('initializing workers')
self.dispatcher.reset(self.state)
else:
other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0))
dirty = False
# endfor single corpus iteration
if reallen != lencorpus:
raise RuntimeError("input corpus size changed during training (don't use generators as input)")
if dirty:
# finish any remaining updates
if self.dispatcher:
# distributed mode: wait for all workers to finish
logger.info("reached the end of input; now waiting for all remaining jobs to finish")
other = self.dispatcher.getstate()
self.do_mstep(rho(), other, pass_ > 0)
del other
def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None):
"""Estimate the variational bound of documents from `corpus`.
:math:`\\mathbb{E}_{q}[\\log p(corpus)] - \\mathbb{E}_{q}[\\log q(corpus)]`
Notes
-----
There are basically two use cases of this method:
#. `chunk` is a subset of the training corpus, and `chunk_doc_idx` is provided,
indicating the indexes of the documents in the training corpus.
#. `chunk` is a test set (held-out data), and `author2doc` and `doc2author` corresponding to this test set
are provided. There must not be any new authors passed to this method, `chunk_doc_idx` is not needed
in this case.
Parameters
----------
chunk : iterable of list of (int, float)
Corpus in BoW format.
chunk_doc_idx : numpy.ndarray, optional
Indices of the chunk's documents within the training corpus, used to look up the authors of each document.
subsample_ratio : float, optional
Ratio of the total corpus size to the size of `chunk`; used to rescale the word score so the bound is
on the scale of the full corpus.
author2doc : dict of (str, list of int), optional
A dictionary where keys are the names of authors and values are lists of document IDs that the author
contributes to.
doc2author : dict of (int, list of str), optional
A dictionary where the keys are document IDs and the values are lists of author names.
Returns
-------
float
Value of variational bound score.
"""
# TODO: enable evaluation of documents with new authors. One could, for example, make it
# possible to pass a list of documents to self.inference with no author dictionaries,
# assuming all the documents correspond to one (unseen) author, learn the author's
# gamma, and return gamma (without adding it to self.state.gamma). Of course,
# collect_sstats should be set to false, so that the model is not updated w.r.t. these
# new documents.
_lambda = self.state.get_lambda()
Elogbeta = dirichlet_expectation(_lambda)
expElogbeta = np.exp(Elogbeta)
gamma = self.state.gamma
if author2doc is None and doc2author is None:
# Evaluating on training documents (chunk of self.corpus).
author2doc = self.author2doc
doc2author = self.doc2author
if not chunk_doc_idx:
# If author2doc and doc2author are not provided, chunk is assumed to be a subset of
# self.corpus, and chunk_doc_idx is thus required.
raise ValueError(
'Either author dictionaries or chunk_doc_idx must be provided. '
'Consult documentation of bound method.'
)
elif author2doc is not None and doc2author is not None:
# Training on held-out documents (documents not seen during training).
# All authors in dictionaries must still be seen during training.
for a in author2doc.keys():
if not self.author2doc.get(a):
raise ValueError('bound cannot be called with authors not seen during training.')
if chunk_doc_idx:
raise ValueError(
'Either author dictionaries or chunk_doc_idx must be provided, not both. '
'Consult documentation of bound method.'
)
else:
raise ValueError(
'Either both author2doc and doc2author should be provided, or neither. '
'Consult documentation of bound method.'
)
Elogtheta = dirichlet_expectation(gamma)
expElogtheta = np.exp(Elogtheta)
word_score = 0.0
theta_score = 0.0
for d, doc in enumerate(chunk):
if chunk_doc_idx:
doc_no = chunk_doc_idx[d]
else:
doc_no = d
# Get all authors in current document, and convert the author names to integer IDs.
authors_d = [self.author2id[a] for a in self.doc2author[doc_no]]
ids = np.array([id for id, _ in doc]) # Word IDs in doc.
cts = np.array([cnt for _, cnt in doc]) # Word counts.
if d % self.chunksize == 0:
logger.debug("bound: at document #%i in chunk", d)
# Computing the bound requires summing over expElogtheta[a, k] * expElogbeta[k, v], which
# is the same computation as in normalizing phi.
phinorm = self.compute_phinorm(expElogtheta[authors_d, :], expElogbeta[:, ids])
word_score += np.log(1.0 / len(authors_d)) * sum(cts) + cts.dot(np.log(phinorm))
# Compensate likelihood for when `chunk` above is only a sample of the whole corpus. This ensures
# that the likelihood is always roughly on the same scale.
word_score *= subsample_ratio
# E[log p(theta | alpha) - log q(theta | gamma)]
for a in author2doc.keys():
a = self.author2id[a]
theta_score += np.sum((self.alpha - gamma[a, :]) * Elogtheta[a, :])
theta_score += np.sum(gammaln(gamma[a, :]) - gammaln(self.alpha))
theta_score += gammaln(np.sum(self.alpha)) - gammaln(np.sum(gamma[a, :]))
# theta_score is rescaled in a similar fashion.
# TODO: treat this in a more general way, similar to how it is done with word_score.
theta_score *= self.num_authors / len(author2doc)
# E[log p(beta | eta) - log q (beta | lambda)]
beta_score = 0.0
beta_score += np.sum((self.eta - _lambda) * Elogbeta)
beta_score += np.sum(gammaln(_lambda) - gammaln(self.eta))
sum_eta = np.sum(self.eta)
beta_score += np.sum(gammaln(sum_eta) - gammaln(np.sum(_lambda, 1)))
total_score = word_score + theta_score + beta_score
return total_score
def get_document_topics(self, word_id, minimum_probability=None):
"""Override :meth:`~gensim.models.ldamodel.LdaModel.get_document_topics` and simply raises an exception.
Warnings
--------
This method invalid for model, use :meth:`~gensim.models.atmodel.AuthorTopicModel.get_author_topics` or
:meth:`~gensim.models.atmodel.AuthorTopicModel.get_new_author_topics` instead.
Raises
------
NotImplementedError
Always.
"""
raise NotImplementedError(
'Method "get_document_topics" is not valid for the author-topic model. '
'Use the "get_author_topics" method.'
)
def get_new_author_topics(self, corpus, minimum_probability=None):
"""Infers topics for new author.
Infers a topic distribution for a new author over the passed corpus of docs,
assuming that all documents are from this single new author.
Parameters
----------
corpus : iterable of list of (int, float)
Corpus in BoW format.
minimum_probability : float, optional
Ignore topics with probability below this value; if None, the model's `minimum_probability` is used
(never lower than 1e-8).
Returns
-------
list of (int, float)
Topic distribution for the given `corpus`.
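Example
-------
A hedged sketch, assuming `model` is a trained :class:`~gensim.models.atmodel.AuthorTopicModel` and
`held_out` is a hypothetical BoW corpus written by a single, previously unseen author:
>>> new_author_topics = model.get_new_author_topics(held_out)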
"""
def rho():
return pow(self.offset + 1 + 1, -self.decay)
def rollback_new_author_changes():
self.state.gamma = self.state.gamma[0:-1]
del self.author2doc[new_author_name]
a_id = self.author2id[new_author_name]
del self.id2author[a_id]
del self.author2id[new_author_name]
for new_doc_id in corpus_doc_idx:
del self.doc2author[new_doc_id]
try:
len_input_corpus = len(corpus)
except TypeError:
logger.warning("input corpus stream has no len(); counting documents")
len_input_corpus = sum(1 for _ in corpus)
if len_input_corpus == 0:
raise ValueError("AuthorTopicModel.get_new_author_topics() called with an empty corpus")
new_author_name = "placeholder_name"
# indexes representing the documents in the input corpus
corpus_doc_idx = list(range(self.total_docs, self.total_docs + len_input_corpus))
# Add the new placeholder author to author2id/id2author dictionaries.
num_new_authors = 1
author_id = self.num_authors
if new_author_name in self.author2id:
raise ValueError("self.author2id already has 'placeholder_name' author")
self.author2id[new_author_name] = author_id
self.id2author[author_id] = new_author_name
# Add new author in author2doc and doc into doc2author.
self.author2doc[new_author_name] = corpus_doc_idx
for new_doc_id in corpus_doc_idx:
self.doc2author[new_doc_id] = [new_author_name]
gamma_new = self.random_state.gamma(100., 1. / 100., (num_new_authors, self.num_topics))
self.state.gamma = np.vstack([self.state.gamma, gamma_new])
# Do not record the sstats, as we are going to delete the new author once the inference is done.
try:
gammat, _ = self.inference(
corpus, self.author2doc, self.doc2author, rho(),
collect_sstats=False, chunk_doc_idx=corpus_doc_idx
)
new_author_topics = self.get_author_topics(new_author_name, minimum_probability)
finally:
rollback_new_author_changes()
return new_author_topics
def get_author_topics(self, author_name, minimum_probability=None):
"""Get topic distribution the given author.
Parameters
----------
author_name : str
Name of the author for which the topic distribution needs to be estimated.
minimum_probability : float, optional
Sets the minimum probability value for showing the topics of a given author, topics with probability <
`minimum_probability` will be ignored.
Returns
-------
list of (int, float)
Topic distribution of an author.
Example
-------
>>> from gensim.models import AuthorTopicModel
>>> from gensim.corpora import mmcorpus
>>> from gensim.test.utils import common_dictionary, datapath, temporary_file
>>> author2doc = {
... 'john': [0, 1, 2, 3, 4, 5, 6],
... 'jane': [2, 3, 4, 5, 6, 7, 8],
... 'jack': [0, 2, 4, 6, 8]
... }
>>>
>>> corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
>>>
>>> with temporary_file("serialized") as s_path:
... model = AuthorTopicModel(
... corpus, author2doc=author2doc, id2word=common_dictionary, num_topics=4,
... serialized=True, serialization_path=s_path
... )
...
... model.update(corpus, author2doc) # update the author-topic model with additional documents
>>>
>>> # construct vectors for authors
>>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
"""
author_id = self.author2id[author_name]
if minimum_probability is None:
minimum_probability = self.minimum_probability
minimum_probability = max(minimum_probability, 1e-8) # never allow zero values in sparse output
topic_dist = self.state.gamma[author_id, :] / sum(self.state.gamma[author_id, :])
author_topics = [
(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
if topicvalue >= minimum_probability
]
return author_topics
def __getitem__(self, author_names, eps=None):
"""Get topic distribution for input `author_names`.
Parameters
----------
author_names : {str, list of str}
Name(s) of the author for which the topic distribution needs to be estimated.
eps : float, optional
The minimum probability value for showing the topics of a given author, topics with probability < `eps`
will be ignored.
Returns
-------
list of (int, float) **or** list of list of (int, float)
Topic distribution for the author(s), type depends on type of `author_names`.
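Example
-------
A short sketch, assuming `model` is a trained :class:`~gensim.models.atmodel.AuthorTopicModel` with
authors 'jane' and 'jack' as in the module-level example:
>>> jane_topics = model['jane']  # topic distribution for a single author
>>> both = model[['jane', 'jack']]  # list with one distribution per author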
"""
if isinstance(author_names, list):
items = []
for a in author_names:
items.append(self.get_author_topics(a, minimum_probability=eps))
else:
items = self.get_author_topics(author_names, minimum_probability=eps)
return items