laywerrobot/lib/python3.6/site-packages/gensim/sklearn_api/ldamodel.py
2020-08-27 21:55:39 +02:00

244 lines
11 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Chinmaya Pancholi <chinmayapancholi13@gmail.com>
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Scikit learn interface for :class:`~gensim.models.ldamodel.LdaModel`.
Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.

Examples
--------
>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.sklearn_api import LdaTransformer
>>>
>>> # Reduce each document to 2 dimensions (topics) using the sklearn interface.
>>> model = LdaTransformer(num_topics=2, id2word=common_dictionary, iterations=20, random_state=1)
>>> docvecs = model.fit_transform(common_corpus)
"""
import numpy as np
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
from gensim import models
from gensim import matutils
class LdaTransformer(TransformerMixin, BaseEstimator):
    """Base LDA module, wraps :class:`~gensim.models.ldamodel.LdaModel`.

    The inner workings of this class depends heavily on `Matthew D. Hoffman, David M. Blei, Francis Bach:
    "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_ and
    `David M. Blei, Andrew Y. Ng, Michael I. Jordan: "Latent Dirichlet Allocation"
    <http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf>`_.

    """

    def __init__(self, num_topics=100, id2word=None, chunksize=2000, passes=1, update_every=1, alpha='symmetric',
                 eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001,
                 minimum_probability=0.01, random_state=None, scorer='perplexity', dtype=np.float32):
        """Store the hyper-parameters; no model is built until :meth:`fit` or :meth:`partial_fit` is called.

        Parameters
        ----------
        num_topics : int, optional
            The number of requested latent topics to be extracted from the training corpus.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping from integer ID to words in the corpus. Used to determine vocabulary size and logging.
        chunksize : int, optional
            Number of documents in batch.
        passes : int, optional
            Number of passes through the corpus during training.
        update_every : int, optional
            Number of documents to be iterated through for each update.
            Set to 0 for batch learning, > 1 for online iterative learning.
        alpha : {np.ndarray, str}, optional
            Can be set to a 1D array of length equal to the number of expected topics that expresses
            our a-priori belief for the each topics' probability.
            Alternatively default prior selecting strategies can be employed by supplying a string:

                * 'symmetric': (default) Uses the same fixed prior for every topic.
                * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / topicno`.
                * 'auto': Learns an asymmetric prior from the corpus.
        eta : {float, np.array, str}, optional
            A-priori belief on word probability, this can be:

                * scalar for a symmetric prior over topic/word probability,
                * vector of length num_words to denote an asymmetric user defined probability for each word,
                * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
                * the string 'auto' to learn the asymmetric prior from the data.
        decay : float, optional
            A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
            when each new document is examined. Corresponds to Kappa from
            `Matthew D. Hoffman, David M. Blei, Francis Bach:
            "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
        offset : float, optional
            Hyper-parameter that controls how much we will slow down the first steps the first few iterations.
            Corresponds to Tau_0 from `Matthew D. Hoffman, David M. Blei, Francis Bach:
            "Online Learning for Latent Dirichlet Allocation NIPS'10" <https://www.di.ens.fr/~fbach/mdhnips2010.pdf>`_.
        eval_every : int, optional
            Log perplexity is estimated every that many updates. Setting this to one slows down training by ~2x.
        iterations : int, optional
            Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.
        gamma_threshold : float, optional
            Minimum change in the value of the gamma parameters to continue iterating.
        minimum_probability : float, optional
            Topics with a probability lower than this threshold will be filtered out.
        random_state : {np.random.RandomState, int}, optional
            Either a randomState object or a seed to generate one. Useful for reproducibility.
        scorer : str, optional
            Method to compute a score reflecting how well the model has fit the input corpus. It is only used
            by :meth:`score` (it is not forwarded to the wrapped model). Allowed values are:

                * 'perplexity': Perplexity of language model
                * 'u_mass': Use :class:`~gensim.models.coherencemodel.CoherenceModel` to compute a topics coherence.
        dtype : {numpy.float16, numpy.float32, numpy.float64}, optional
            Data-type to use during calculations inside model. All inputs are also converted.

        Notes
        -----
        Configure `passes` and `update_every` params to choose the mode among:

            * online (single-pass): update_every > 0 and passes == 1
            * online (multi-pass): update_every > 0 and passes > 1
            * batch: update_every == 0 (or None)

        By default, 'online (single-pass)' mode is used for training the LDA model.

        """
        # The wrapped gensim model; stays None until `fit` / `partial_fit` creates it.
        self.gensim_model = None
        # scikit-learn convention: constructor arguments are stored verbatim under the same names.
        self.num_topics = num_topics
        self.id2word = id2word
        self.chunksize = chunksize
        self.passes = passes
        self.update_every = update_every
        self.alpha = alpha
        self.eta = eta
        self.decay = decay
        self.offset = offset
        self.eval_every = eval_every
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold
        self.minimum_probability = minimum_probability
        self.random_state = random_state
        self.scorer = scorer
        self.dtype = dtype

    @staticmethod
    def _as_corpus(X):
        """Return `X` as a BOW corpus, converting a scipy sparse matrix whose rows are documents."""
        if sparse.issparse(X):
            return matutils.Sparse2Corpus(sparse=X, documents_columns=False)
        return X

    def _new_model(self, corpus=None):
        """Build an :class:`~gensim.models.ldamodel.LdaModel` from the stored hyper-parameters.

        When `corpus` is None the model is only initialized and no training corpus is passed to it.
        """
        return models.LdaModel(
            corpus=corpus, num_topics=self.num_topics, id2word=self.id2word,
            chunksize=self.chunksize, passes=self.passes, update_every=self.update_every,
            alpha=self.alpha, eta=self.eta, decay=self.decay, offset=self.offset,
            eval_every=self.eval_every, iterations=self.iterations,
            gamma_threshold=self.gamma_threshold, minimum_probability=self.minimum_probability,
            random_state=self.random_state, dtype=self.dtype
        )

    def _check_is_fitted(self):
        """Raise :class:`sklearn.exceptions.NotFittedError` if no model has been trained yet."""
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Any previously trained model held by this instance is replaced.

        Parameters
        ----------
        X : {iterable of iterable of (int, int), scipy.sparse matrix}
            A collection of documents in BOW format used for training the model.
        y : object, optional
            Ignored; present only for scikit-learn API compatibility.

        Returns
        -------
        :class:`~gensim.sklearn_api.ldamodel.LdaTransformer`
            The trained model.

        """
        self.gensim_model = self._new_model(corpus=self._as_corpus(X))
        return self

    def transform(self, docs):
        """Infer the topic distribution for `docs`.

        Parameters
        ----------
        docs : {iterable of list of (int, number), list of (int, number), scipy.sparse matrix}
            Document or sequence of documents in BoW format. A sparse matrix is interpreted with one
            document per row, as in :meth:`fit`.

        Returns
        -------
        numpy.ndarray of shape [number of documents, `num_topics`]
            The topic distribution for each input document. An empty input yields an array with zero rows.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the model has not been trained yet.

        """
        self._check_is_fitted()

        if sparse.issparse(docs):
            docs = self._as_corpus(docs)
        elif len(docs) > 0 and isinstance(docs[0], tuple):
            # A single document (a list of (id, count) tuples) was passed: wrap it into a one-document corpus.
            # The length guard keeps an empty input from failing on `docs[0]`.
            docs = [docs]

        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
        # Reshape explicitly so that zero documents still produce a 2D (0, num_topics) result.
        return np.reshape(np.array(distribution), (len(distribution), self.num_topics))

    def partial_fit(self, X):
        """Train model over a potentially incomplete set of documents.

        Uses the parameters set in the constructor.
        This method can be used in two ways:

            * On an unfitted model in which case the model is initialized and trained on `X`.
            * On an already fitted model in which case the model is **updated** by `X`.

        Parameters
        ----------
        X : {iterable of iterable of (int, int), scipy.sparse matrix}
            A collection of documents in BOW format used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.ldamodel.LdaTransformer`
            The trained model.

        """
        corpus = self._as_corpus(X)

        if self.gensim_model is None:
            # First call: create an untrained model; the actual training happens in `update` below.
            self.gensim_model = self._new_model()

        self.gensim_model.update(corpus=corpus)
        return self

    def score(self, X, y=None):
        """Compute score reflecting how well the model has fitted for the input data.

        The scoring method is set using the `scorer` argument in :meth:`~gensim.sklearn_api.ldamodel.LdaTransformer`.
        Higher score is better.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            Sequence of documents in BOW format. For the 'perplexity' scorer the corpus is iterated here to
            count words and is then handed to the model, so a one-shot generator is not suitable.
        y : object, optional
            Ignored; present only for scikit-learn API compatibility.

        Returns
        -------
        float
            The score computed based on the selected method.

        Raises
        ------
        ValueError
            If `scorer` is neither 'perplexity' nor 'u_mass'.
        sklearn.exceptions.NotFittedError
            If the model has not been trained yet.

        """
        if self.scorer not in ('perplexity', 'u_mass'):
            raise ValueError("Invalid value {} supplied for `scorer` param".format(self.scorer))
        self._check_is_fitted()

        corpus = self._as_corpus(X)

        if self.scorer == 'perplexity':
            corpus_words = sum(cnt for document in corpus for _, cnt in document)
            subsample_ratio = 1.0
            perwordbound = \
                self.gensim_model.bound(corpus, subsample_ratio=subsample_ratio) / (subsample_ratio * corpus_words)
            return -1 * np.exp2(-perwordbound)  # returning (-1*perplexity) to select model with minimum value

        # Only 'u_mass' remains after the validation above.
        goodcm = models.CoherenceModel(model=self.gensim_model, corpus=corpus, coherence=self.scorer, topn=3)
        return goodcm.get_coherence()