laywerrobot/lib/python3.6/site-packages/gensim/sklearn_api/ldaseqmodel.py

147 lines
6.6 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Chinmaya Pancholi <chinmayapancholi13@gmail.com>
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Scikit learn interface for :class:`~gensim.models.ldaseqmodel.LdaSeqModel`.
Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.
Examples
--------
>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer
>>>
>>> # Create a sequential LDA transformer to extract 2 topics from the common corpus.
>>> # Divide the work into 3 unequal time slices.
>>> model = LdaSeqTransformer(id2word=common_dictionary, num_topics=2, time_slice=[3, 4, 2], initialize='gensim')
>>>
>>> # Each document almost entirely belongs to one of the two topics.
>>> transformed_corpus = model.fit_transform(common_corpus)
"""
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
from gensim import models
class LdaSeqTransformer(TransformerMixin, BaseEstimator):
    """Base Sequential LDA module, wraps :class:`~gensim.models.ldaseqmodel.LdaSeqModel`.

    For more information take a look at `David M. Blei, John D. Lafferty: "Dynamic Topic Models"
    <https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf>`_.
    """
    def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None,
                 lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None,
                 lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
        """
        Parameters
        ----------
        time_slice : list of int, optional
            Number of documents in each time-slice.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping from an ID to the word it represents in the vocabulary.
        alphas : float, optional
            The prior probability of each topic.
        num_topics : int, optional
            Number of latent topics to be discovered in the corpus.
        initialize : {'gensim', 'own', 'ldamodel'}, optional
            Controls the initialization of the DTM model. Supports three different modes:
            * 'gensim': Uses gensim's own LDA initialization.
            * 'own': Uses your own initialization matrix of an LDA model that has been previously trained,
              passed through the `sstats` argument.
            * 'ldamodel': Use a previously trained LDA model, passing it through the `lda_model` argument.
        sstats : np.ndarray of shape [vocab_len, `num_topics`], optional
            If `initialize` is set to 'own' this will be used to initialize the DTM model.
        lda_model : :class:`~gensim.models.ldamodel.LdaModel`, optional
            If `initialize` is set to 'ldamodel' this object will be used to create the `sstats`
            initialization matrix.
        obs_variance : float, optional
            Observed variance used to approximate the true and forward variance as shown in
            `David M. Blei, John D. Lafferty: "Dynamic Topic Models"
            <https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf>`_.
        chain_variance : float, optional
            Gaussian parameter defined in the beta distribution to dictate how the beta values evolve.
        passes : int, optional
            Number of passes over the corpus for the initial :class:`~gensim.models.ldamodel.LdaModel`.
        random_state : {numpy.random.RandomState, int}, optional
            Can be a np.random.RandomState object, or the seed to generate one. Used for reproducibility of results.
        lda_inference_max_iter : int, optional
            Maximum number of iterations in the inference step of the LDA training.
        em_min_iter : int, optional
            Minimum number of iterations until convergence of the Expectation-Maximization algorithm.
        em_max_iter : int, optional
            Maximum number of iterations until convergence of the Expectation-Maximization algorithm.
        chunksize : int, optional
            Number of documents in the corpus to be processed in a chunk.
        """
        # Holds the fitted LdaSeqModel; stays None until `fit` is called
        # (transform checks this to raise NotFittedError).
        self.gensim_model = None
        # All constructor arguments are stored verbatim on `self`, as required by the
        # scikit-learn estimator contract (get_params/set_params/clone).
        self.time_slice = time_slice
        self.id2word = id2word
        self.alphas = alphas
        self.num_topics = num_topics
        self.initialize = initialize
        self.sstats = sstats
        self.lda_model = lda_model
        self.obs_variance = obs_variance
        self.chain_variance = chain_variance
        self.passes = passes
        self.random_state = random_state
        self.lda_inference_max_iter = lda_inference_max_iter
        self.em_min_iter = em_min_iter
        self.em_max_iter = em_max_iter
        self.chunksize = chunksize

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format used for training the model.
        y : object, optional
            Ignored. Present only for scikit-learn API compatibility.

        Returns
        -------
        :class:`~gensim.sklearn_api.ldaseqmodel.LdaSeqTransformer`
            The trained model.
        """
        # Training happens entirely inside LdaSeqModel's constructor; every
        # hyperparameter stored in __init__ is forwarded unchanged.
        self.gensim_model = models.LdaSeqModel(
            corpus=X, time_slice=self.time_slice, id2word=self.id2word,
            alphas=self.alphas, num_topics=self.num_topics, initialize=self.initialize, sstats=self.sstats,
            lda_model=self.lda_model, obs_variance=self.obs_variance, chain_variance=self.chain_variance,
            passes=self.passes, random_state=self.random_state, lda_inference_max_iter=self.lda_inference_max_iter,
            em_min_iter=self.em_min_iter, em_max_iter=self.em_max_iter, chunksize=self.chunksize
        )
        return self

    def transform(self, docs):
        """Infer the topic distribution for `docs`.

        Parameters
        ----------
        docs : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format to be transformed.

        Returns
        -------
        numpy.ndarray of shape [`len(docs)`, `num_topics`]
            The topic representation of each document.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the model has not been fitted yet.
        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # If a single BOW document (a list of (id, count) tuples) was passed,
        # wrap it in a list so the code below always sees a collection of documents.
        if isinstance(docs[0], tuple):
            docs = [docs]
        proportions = [self.gensim_model[doc] for doc in docs]
        return np.reshape(np.array(proportions), (len(docs), self.num_topics))