laywerrobot/lib/python3.6/site-packages/gensim/sklearn_api/ldaseqmodel.py
2020-08-27 21:55:39 +02:00

146 lines
6.6 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Chinmaya Pancholi <chinmayapancholi13@gmail.com>
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Scikit learn interface for :class:`~gensim.models.ldaseqmodel.LdaSeqModel`.
Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.
Examples
--------
>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer
>>>
>>> # Create a sequential LDA transformer to extract 2 topics from the common corpus.
>>> # Divide the work into 3 unequal time slices.
>>> model = LdaSeqTransformer(id2word=common_dictionary, num_topics=2, time_slice=[3, 4, 2], initialize='gensim')
>>>
>>> # Each document almost entirely belongs to one of the two topics.
>>> transformed_corpus = model.fit_transform(common_corpus)
"""
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
from gensim import models
class LdaSeqTransformer(TransformerMixin, BaseEstimator):
    """Base Sequential LDA module, wraps :class:`~gensim.models.ldaseqmodel.LdaSeqModel` model.

    For more information take a look at `David M. Blei, John D. Lafferty: "Dynamic Topic Models"
    <https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf>`_.

    """

    def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None,
                 lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None,
                 lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
        """Store the hyperparameters; the underlying gensim model is only created in :meth:`fit`.

        Parameters
        ----------
        time_slice : list of int, optional
            Number of documents in each time-slice.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping from an ID to the word it represents in the vocabulary.
        alphas : float, optional
            The prior probability of each topic.
        num_topics : int, optional
            Number of latent topics to be discovered in the corpus.
        initialize : {'gensim', 'own', 'ldamodel'}, optional
            Controls the initialization of the DTM model. Supports three different modes:
                * 'gensim': Uses gensim's own LDA initialization.
                * 'own': Uses your own initialization matrix of an LDA model that has been previously trained.
                * 'ldamodel': Use a previously used LDA model, passing it through the `lda_model` argument.
        sstats : np.ndarray of shape [vocab_len, `num_topics`], optional
            If `initialize` is set to 'own' this will be used to initialize the DTM model.
        lda_model : :class:`~gensim.models.ldamodel.LdaModel`, optional
            If `initialize` is set to 'ldamodel' this object will be used to create the `sstats` initialization
            matrix.
        obs_variance : float, optional
            Observed variance used to approximate the true and forward variance as shown in
            `David M. Blei, John D. Lafferty: "Dynamic Topic Models"
            <https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf>`_.
        chain_variance : float, optional
            Gaussian parameter defined in the beta distribution to dictate how the beta values evolve.
        passes : int, optional
            Number of passes over the corpus for the initial :class:`~gensim.models.ldamodel.LdaModel`.
        random_state : {numpy.random.RandomState, int}, optional
            Can be a np.random.RandomState object, or the seed to generate one. Used for reproducibility of results.
        lda_inference_max_iter : int, optional
            Maximum number of iterations in the inference step of the LDA training.
        em_min_iter : int, optional
            Minimum number of iterations until convergence of the Expectation-Maximization algorithm.
        em_max_iter : int, optional
            Maximum number of iterations until convergence of the Expectation-Maximization algorithm.
        chunksize : int, optional
            Number of documents in the corpus to be processed in a chunk.

        """
        # The wrapped gensim model; stays None until `fit` has been called, which `transform` relies on
        # to detect an unfitted estimator.
        self.gensim_model = None
        self.time_slice = time_slice
        self.id2word = id2word
        self.alphas = alphas
        self.num_topics = num_topics
        self.initialize = initialize
        self.sstats = sstats
        self.lda_model = lda_model
        self.obs_variance = obs_variance
        self.chain_variance = chain_variance
        self.passes = passes
        self.random_state = random_state
        self.lda_inference_max_iter = lda_inference_max_iter
        self.em_min_iter = em_min_iter
        self.em_max_iter = em_max_iter
        self.chunksize = chunksize

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        A new :class:`~gensim.models.ldaseqmodel.LdaSeqModel` is built from `X` and the stored hyperparameters,
        replacing any model held from a previous call.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format used for training the model.
        y : object, optional
            Not used by this method; accepted so the signature matches the scikit-learn `fit(X, y)` convention.

        Returns
        -------
        :class:`~gensim.sklearn_api.ldaseqmodel.LdaSeqTransformer`
            The trained model.

        """
        self.gensim_model = models.LdaSeqModel(
            corpus=X, time_slice=self.time_slice, id2word=self.id2word,
            alphas=self.alphas, num_topics=self.num_topics, initialize=self.initialize, sstats=self.sstats,
            lda_model=self.lda_model, obs_variance=self.obs_variance, chain_variance=self.chain_variance,
            passes=self.passes, random_state=self.random_state, lda_inference_max_iter=self.lda_inference_max_iter,
            em_min_iter=self.em_min_iter, em_max_iter=self.em_max_iter, chunksize=self.chunksize
        )
        return self

    def transform(self, docs):
        """Infer the topic distribution for `docs`.

        Parameters
        ----------
        docs : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format to be transformed. A single document, given as a sequence
            of (int, number) tuples, is also accepted and treated as a collection of one document.

        Returns
        -------
        numpy.ndarray of shape [`len(docs)`, `num_topics`]
            The topic representation of each document. An empty input yields an array of shape
            [0, `num_topics`].

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If :meth:`fit` has not been called yet.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # One-shot iterables (e.g. generators) support neither indexing nor len(), both of which are needed
        # below, so materialize them first.
        if not hasattr(docs, "__getitem__"):
            docs = list(docs)

        # Nothing to transform: return an empty result instead of failing on `docs[0]`.
        if len(docs) == 0:
            return np.empty((0, self.num_topics))

        # If the first element is a tuple, `docs` is itself a single BOW document rather than a collection
        # of documents, so wrap it to get the "array of arrays" layout handled below.
        if isinstance(docs[0], tuple):
            docs = [docs]

        proportions = [self.gensim_model[doc] for doc in docs]
        return np.reshape(np.array(proportions), (len(docs), self.num_topics))