laywerrobot/lib/python3.6/site-packages/gensim/sklearn_api/atmodel.py
2020-08-27 21:55:39 +02:00

221 lines
11 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Chinmaya Pancholi <chinmayapancholi13@gmail.com>
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Scikit learn interface for :class:`~gensim.models.atmodel.AuthorTopicModel`.
Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.
Examples
--------
>>> from gensim.test.utils import common_texts, common_dictionary, common_corpus
>>> from gensim.sklearn_api.atmodel import AuthorTopicTransformer
>>>
>>> # Pass a mapping from authors to the documents they contributed to.
>>> author2doc = {
... 'john': [0, 1, 2, 3, 4, 5, 6],
... 'jane': [2, 3, 4, 5, 6, 7, 8],
... 'jack': [0, 2, 4, 6, 8]
... }
>>>
>>> # Let's use the model to discover 2 different topics.
>>> model = AuthorTopicTransformer(id2word=common_dictionary, author2doc=author2doc, num_topics=2, passes=100)
>>>
>>> # To which of those 2 topics does jack mostly contribute?
>>> topic_dist = model.fit(common_corpus).transform('jack')
"""
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
from gensim import models
from gensim import matutils
class AuthorTopicTransformer(TransformerMixin, BaseEstimator):
    """Base Author Topic module, wraps :class:`~gensim.models.atmodel.AuthorTopicModel`.

    The model's internal workings are heavily based on `"The Author-Topic Model for Authors and Documents",
    Rosen-Zvi et. al 2004 <https://mimno.infosci.cornell.edu/info6150/readings/398.pdf>`_.

    """

    def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=None,
                 chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
                 alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
                 gamma_threshold=0.001, serialized=False, serialization_path=None,
                 minimum_probability=0.01, random_state=None):
        """Store the hyper-parameters; the wrapped gensim model is only created by `fit` / `partial_fit`.

        Parameters
        ----------
        num_topics : int, optional
            Number of requested latent topics to be extracted from the training corpus.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping from a word's ID to the word itself. Used to determine the vocabulary size, as well as for
            debugging and topic printing.
        author2doc : dict of (str, list of int), optional
            Maps an author's name to a list of document IDs to which the author has contributed.
            Either `author2doc` or `doc2author` **must be supplied**.
        doc2author : dict of (int, list of str), optional
            Maps a document (using its ID) to a list of author names that contributed to it.
            Either `author2doc` or `doc2author` **must be supplied**.
        chunksize : int, optional
            Number of documents to be processed by the model in each mini-batch.
        passes : int, optional
            Number of times the model can make a pass over the corpus during training.
        iterations : int, optional
            Maximum number of iterations the model performs before convergence during the M step of the
            EM algorithm.
        decay : float, optional
            A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
            when each new document is examined. Corresponds to Kappa from `"The Author-Topic Model for Authors
            and Documents", Rosen-Zvi et. al 2004 <https://mimno.infosci.cornell.edu/info6150/readings/398.pdf>`_.
        offset : float, optional
            Hyper-parameter that controls how much we will slow down the first few iterations.
            Corresponds to Tau_0 from `"The Author-Topic Model for Authors and Documents", Rosen-Zvi et. al 2004
            <https://mimno.infosci.cornell.edu/info6150/readings/398.pdf>`_.
        alpha : {np.ndarray, str}, optional
            Can be set to a 1D array of length equal to the number of expected topics that expresses
            our a-priori belief for each topic's probability.
            Alternatively default prior selecting strategies can be employed by supplying a string:

                * 'symmetric': (default) Uses a fixed symmetric prior.
                * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / topicno`.
                * 'auto': Learns an asymmetric prior from the corpus.
        eta : {float, np.array, str}, optional
            A-priori belief on word probability, this can be:

                * scalar for a symmetric prior over topic/word probability,
                * vector of length num_words to denote an asymmetric user defined probability for each word,
                * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
                * the string 'auto' to learn the asymmetric prior from the data.
        update_every : int, optional
            Number of mini-batches between each model update.
        eval_every : int, optional
            Number of updates between two log perplexity estimates.
            Set to None to disable perplexity estimation.
        gamma_threshold : float, optional
            Minimum change in the value of the gamma parameters to continue iterating.
        serialized : bool, optional
            Indicates whether the input corpora to the model are simple in-memory lists (`serialized = False`)
            or saved to the hard-drive (`serialized = True`). Note that this behaviour is quite different from
            other Gensim models. If your data is too large to fit into memory, use this functionality.
        serialization_path : str, optional
            Path to the file used for storing the serialized object, **must be supplied if `serialized = True`**.
            An existing file *cannot* be overwritten, either delete the old file or choose a different name.
        minimum_probability : float, optional
            Topics with a probability lower than this threshold will be filtered out.
        random_state : {np.random.RandomState, int}, optional
            Either a randomState object or a seed to generate one. Useful for reproducibility.

        """
        # `None` marks the estimator as "not fitted"; `transform` and `partial_fit` both test for it.
        self.gensim_model = None
        self.num_topics = num_topics
        self.id2word = id2word
        self.author2doc = author2doc
        self.doc2author = doc2author
        self.chunksize = chunksize
        self.passes = passes
        self.iterations = iterations
        self.decay = decay
        self.offset = offset
        self.alpha = alpha
        self.eta = eta
        self.update_every = update_every
        self.eval_every = eval_every
        self.gamma_threshold = gamma_threshold
        self.serialized = serialized
        self.serialization_path = serialization_path
        self.minimum_probability = minimum_probability
        self.random_state = random_state

    def _build_model(self, corpus):
        """Create the wrapped `AuthorTopicModel` on `corpus` from the stored hyper-parameters."""
        return models.AuthorTopicModel(
            corpus=corpus, num_topics=self.num_topics, id2word=self.id2word,
            author2doc=self.author2doc, doc2author=self.doc2author, chunksize=self.chunksize, passes=self.passes,
            iterations=self.iterations, decay=self.decay, offset=self.offset, alpha=self.alpha, eta=self.eta,
            update_every=self.update_every, eval_every=self.eval_every, gamma_threshold=self.gamma_threshold,
            serialized=self.serialized, serialization_path=self.serialization_path,
            minimum_probability=self.minimum_probability, random_state=self.random_state
        )

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Any previously fitted model is replaced.

        Parameters
        ----------
        X : iterable of list of (int, number)
            Sequence of documents in BoW format.
        y : object, optional
            Unused; accepted so the signature matches the scikit-learn `fit(X, y)` convention.

        Returns
        -------
        :class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer`
            The trained model.

        """
        self.gensim_model = self._build_model(X)
        return self

    def transform(self, author_names):
        """Infer the topic probabilities for each author.

        Parameters
        ----------
        author_names : {iterable of str, str}
            Author name or sequence of author names whose topics will be identified.

        Returns
        -------
        numpy.ndarray
            Topic distribution for each input author, of shape `(number of authors, num_topics)`.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the model has not been fitted yet.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # Normalise the input to a list of names. Only a single name (a string, or any non-iterable
        # value) is wrapped; other iterables such as tuples, arrays or generators are materialised so
        # that each element is looked up as its own author instead of the container being one "name".
        if isinstance(author_names, (str, bytes)) or not hasattr(author_names, '__iter__'):
            author_names = [author_names]
        elif not isinstance(author_names, list):
            author_names = list(author_names)

        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        topics = [matutils.sparse2full(self.gensim_model[author_name], self.num_topics) for author_name in author_names]
        return np.reshape(np.array(topics), (len(author_names), self.num_topics))

    def partial_fit(self, X, author2doc=None, doc2author=None):
        """Train model over a potentially incomplete set of documents.

        This method can be used in two ways:
            * On an unfitted model in which case the model is initialized and trained on `X`.
            * On an already fitted model in which case the model is **updated** by `X`.

        Parameters
        ----------
        X : iterable of list of (int, number)
            Sequence of documents in BoW format.
        author2doc : dict of (str, list of int), optional
            Maps an author's name to a list of document IDs to which the author has contributed.
            Either `author2doc` or `doc2author` **must be supplied**.
        doc2author : dict of (int, list of str), optional
            Maps a document (using its ID) to a list of author names that contributed to it.
            Either `author2doc` or `doc2author` **must be supplied**.

        Returns
        -------
        :class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer`
            The trained model.

        """
        if self.gensim_model is None:
            # First call: the model is built with the author mappings given to the constructor.
            # NOTE(review): the model is constructed with `X` here and `update` below is then called
            # with the same `X` - confirm that this second pass over the first batch is intended.
            self.gensim_model = self._build_model(X)

        self.gensim_model.update(corpus=X, author2doc=author2doc, doc2author=doc2author)
        return self