#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Chinmaya Pancholi <chinmayapancholi13@gmail.com>
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Scikit learn interface for :class:`~gensim.models.ldaseqmodel.LdaSeqModel`.

Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.

Examples
--------
>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.sklearn_api.ldaseqmodel import LdaSeqTransformer
>>>
>>> # Create a sequential LDA transformer to extract 2 topics from the common corpus.
>>> # Divide the work into 3 unequal time slices.
>>> model = LdaSeqTransformer(id2word=common_dictionary, num_topics=2, time_slice=[3, 4, 2], initialize='gensim')
>>>
>>> # Each document almost entirely belongs to one of the two topics.
>>> transformed_corpus = model.fit_transform(common_corpus)

"""
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError

from gensim import models

|
class LdaSeqTransformer(TransformerMixin, BaseEstimator):
    """Base Sequential LDA module, wraps :class:`~gensim.models.ldaseqmodel.LdaSeqModel` model.

    For more information take a look at `David M. Blei, John D. Lafferty: "Dynamic Topic Models"
    <https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf>`_.

    """
    def __init__(self, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None,
                 lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None,
                 lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
        """

        Parameters
        ----------
        time_slice : list of int, optional
            Number of documents in each time-slice.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping from an ID to the word it represents in the vocabulary.
        alphas : float, optional
            The prior probability of each topic.
        num_topics : int, optional
            Number of latent topics to be discovered in the corpus.
        initialize : {'gensim', 'own', 'ldamodel'}, optional
            Controls the initialization of the DTM model. Supports three different modes:

            * 'gensim': Uses gensim's own LDA initialization.
            * 'own': Uses your own initialization matrix of an LDA model that has been previously trained.
            * 'ldamodel': Use a previously used LDA model, passing it through the `lda_model` argument.
        sstats : np.ndarray of shape [vocab_len, `num_topics`], optional
            If `initialize` is set to 'own' this will be used to initialize the DTM model.
        lda_model : :class:`~gensim.models.ldamodel.LdaModel`, optional
            If `initialize` is set to 'ldamodel' this object will be used to create the `sstats`
            initialization matrix.
        obs_variance : float, optional
            Observed variance used to approximate the true and forward variance as shown in
            `David M. Blei, John D. Lafferty: "Dynamic Topic Models"
            <https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf>`_.
        chain_variance : float, optional
            Gaussian parameter defined in the beta distribution to dictate how the beta values evolve.
        passes : int, optional
            Number of passes over the corpus for the initial :class:`~gensim.models.ldamodel.LdaModel`.
        random_state : {numpy.random.RandomState, int}, optional
            Can be a np.random.RandomState object, or the seed to generate one. Used for reproducibility of results.
        lda_inference_max_iter : int, optional
            Maximum number of iterations in the inference step of the LDA training.
        em_min_iter : int, optional
            Minimum number of iterations until convergence of the Expectation-Maximization algorithm.
        em_max_iter : int, optional
            Maximum number of iterations until convergence of the Expectation-Maximization algorithm.
        chunksize : int, optional
            Number of documents in the corpus to be processed in a chunk.

        """
        # Scikit-learn convention: `__init__` only records hyper-parameters
        # verbatim; the actual training happens in `fit`.
        self.gensim_model = None  # set by `fit`; `None` means "not fitted yet"
        self.time_slice = time_slice
        self.id2word = id2word
        self.alphas = alphas
        self.num_topics = num_topics
        self.initialize = initialize
        self.sstats = sstats
        self.lda_model = lda_model
        self.obs_variance = obs_variance
        self.chain_variance = chain_variance
        self.passes = passes
        self.random_state = random_state
        self.lda_inference_max_iter = lda_inference_max_iter
        self.em_min_iter = em_min_iter
        self.em_max_iter = em_max_iter
        self.chunksize = chunksize

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format used for training the model.
        y : object, optional
            Ignored. Present only for scikit-learn API compatibility.

        Returns
        -------
        :class:`~gensim.sklearn_api.ldaseqmodel.LdaSeqTransformer`
            The trained model.

        """
        # Train a fresh LdaSeqModel from the stored hyper-parameters; refitting
        # discards any previously trained model.
        self.gensim_model = models.LdaSeqModel(
            corpus=X, time_slice=self.time_slice, id2word=self.id2word,
            alphas=self.alphas, num_topics=self.num_topics, initialize=self.initialize, sstats=self.sstats,
            lda_model=self.lda_model, obs_variance=self.obs_variance, chain_variance=self.chain_variance,
            passes=self.passes, random_state=self.random_state, lda_inference_max_iter=self.lda_inference_max_iter,
            em_min_iter=self.em_min_iter, em_max_iter=self.em_max_iter, chunksize=self.chunksize
        )
        return self

    def transform(self, docs):
        """Infer the topic distribution for `docs`.

        Parameters
        ----------
        docs : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format to be transformed.

        Returns
        -------
        numpy.ndarray of shape [`len(docs)`, `num_topics`]
            The topic representation of each document.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the model has not been fitted yet.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # An empty list of documents has no first element to inspect below;
        # return an empty (0, num_topics) result instead of raising IndexError.
        if isinstance(docs, list) and not docs:
            return np.empty((0, self.num_topics))

        # A single document is a list of (id, count) tuples; wrap it so the
        # inference loop below always sees a collection of documents.
        if isinstance(docs[0], tuple):
            docs = [docs]
        proportions = [self.gensim_model[doc] for doc in docs]
        return np.reshape(np.array(proportions), (len(docs), self.num_topics))