221 lines
11 KiB
Python
221 lines
11 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Author: Chinmaya Pancholi <chinmayapancholi13@gmail.com>
|
|
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
|
|
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
|
|
|
|
"""Scikit learn interface for :class:`~gensim.models.atmodel.AuthorTopicModel`.
|
|
|
|
Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.
|
|
|
|
Examples
|
|
--------
|
|
>>> from gensim.test.utils import common_texts, common_dictionary, common_corpus
|
|
>>> from gensim.sklearn_api.atmodel import AuthorTopicTransformer
|
|
>>>
|
|
>>> # Pass a mapping from authors to the documents they contributed to.
|
|
>>> author2doc = {
|
|
... 'john': [0, 1, 2, 3, 4, 5, 6],
|
|
... 'jane': [2, 3, 4, 5, 6, 7, 8],
|
|
... 'jack': [0, 2, 4, 6, 8]
|
|
... }
|
|
>>>
|
|
>>> # Let's use the model to discover 2 different topics.
|
|
>>> model = AuthorTopicTransformer(id2word=common_dictionary, author2doc=author2doc, num_topics=2, passes=100)
|
|
>>>
|
|
>>> # To which of those 2 topics does jack mostly contribute?
|
|
>>> topic_dist = model.fit(common_corpus).transform('jack')
|
|
|
|
"""
|
|
import numpy as np
|
|
from sklearn.base import TransformerMixin, BaseEstimator
|
|
from sklearn.exceptions import NotFittedError
|
|
|
|
from gensim import models
|
|
from gensim import matutils
|
|
|
|
|
|
class AuthorTopicTransformer(TransformerMixin, BaseEstimator):
    """Base Author Topic module, wraps :class:`~gensim.models.atmodel.AuthorTopicModel`.

    The model's internal workings are heavily based on `"The Author-Topic Model for Authors and Documents",
    Rosen-Zvi et. al 2004 <https://mimno.infosci.cornell.edu/info6150/readings/398.pdf>`_.

    """
    def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=None,
                 chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
                 alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
                 gamma_threshold=0.001, serialized=False, serialization_path=None,
                 minimum_probability=0.01, random_state=None):
        """

        Parameters
        ----------
        num_topics : int, optional
            Number of requested latent topics to be extracted from the training corpus.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping from a word's ID to the word itself. Used to determine the vocabulary size, as well as for
            debugging and topic printing.
        author2doc : dict of (str, list of int), optional
            Maps an author's name to a list of IDs of the documents they have contributed to.
            Either `author2doc` or `doc2author` **must be supplied**.
        doc2author : dict of (int, list of str)
            Maps a document (using its ID) to a list of author names that contributed to it.
            Either `author2doc` or `doc2author` **must be supplied**.
        chunksize : int, optional
            Number of documents to be processed by the model in each mini-batch.
        passes : int, optional
            Number of times the model can make a pass over the corpus during training.
        iterations : int, optional
            Maximum number of times the model iterates before convergence during the M step of the EM algorithm.
        decay : float, optional
            A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
            when each new document is examined. Corresponds to Kappa from `"The Author-Topic Model for Authors
            and Documents", Rosen-Zvi et. al 2004 <https://mimno.infosci.cornell.edu/info6150/readings/398.pdf>`_.
        offset : float, optional
            Hyper-parameter that controls how much we will slow down the first few iterations.
            Corresponds to Tau_0 from `"The Author-Topic Model for Authors and Documents", Rosen-Zvi et. al 2004
            <https://mimno.infosci.cornell.edu/info6150/readings/398.pdf>`_.
        alpha : {np.ndarray, str}, optional
            Can be set to a 1D array of length equal to the number of expected topics that expresses
            our a-priori belief for each topic's probability.
            Alternatively default prior selecting strategies can be employed by supplying a string:

                * 'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / topicno`.
                * 'default': Learns an asymmetric prior from the corpus.
        eta : {float, np.array, str}, optional
            A-priori belief on word probability, this can be:

                * scalar for a symmetric prior over topic/word probability,
                * vector of length num_words to denote an asymmetric user defined probability for each word,
                * matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
                * the string 'auto' to learn the asymmetric prior from the data.
        update_every : int, optional
            Number of mini-batches between each model update.
        eval_every : int, optional
            Number of updates between two log perplexity estimates.
            Set to None to disable perplexity estimation.
        gamma_threshold : float, optional
            Minimum change in the value of the gamma parameters to continue iterating.
        serialized : bool, optional
            Indicates whether the input corpora to the model are simple in-memory lists (`serialized = False`)
            or saved to the hard-drive (`serialized = True`). Note that this behaviour is quite different from
            other Gensim models. If your data is too large to fit in to memory, use this functionality.
        serialization_path : str, optional
            Path to the file used for storing the serialized object, **must be supplied if `serialized = True`**.
            An existing file *cannot* be overwritten, either delete the old file or choose a different name.
        minimum_probability : float, optional
            Topics with a probability lower than this threshold will be filtered out.
        random_state : {np.random.RandomState, int}, optional
            Either a randomState object or a seed to generate one. Useful for reproducibility.

        """
        # `None` marks the transformer as "not fitted yet"; `transform` and `partial_fit` both test for it.
        self.gensim_model = None
        self.num_topics = num_topics
        self.id2word = id2word
        self.author2doc = author2doc
        self.doc2author = doc2author
        self.chunksize = chunksize
        self.passes = passes
        self.iterations = iterations
        self.decay = decay
        self.offset = offset
        self.alpha = alpha
        self.eta = eta
        self.update_every = update_every
        self.eval_every = eval_every
        self.gamma_threshold = gamma_threshold
        self.serialized = serialized
        self.serialization_path = serialization_path
        self.minimum_probability = minimum_probability
        self.random_state = random_state

    def _build_model(self, corpus, author2doc, doc2author):
        """Create the wrapped :class:`~gensim.models.atmodel.AuthorTopicModel` from the stored hyper-parameters.

        Parameters
        ----------
        corpus : iterable of list of (int, number)
            Sequence of documents in BoW format, forwarded as the `corpus` argument.
        author2doc : dict of (str, list of int) or None
            Author to document IDs mapping, forwarded unchanged.
        doc2author : dict of (int, list of str) or None
            Document ID to authors mapping, forwarded unchanged.

        Returns
        -------
        :class:`~gensim.models.atmodel.AuthorTopicModel`
            The newly constructed model.

        """
        return models.AuthorTopicModel(
            corpus=corpus, num_topics=self.num_topics, id2word=self.id2word,
            author2doc=author2doc, doc2author=doc2author, chunksize=self.chunksize, passes=self.passes,
            iterations=self.iterations, decay=self.decay, offset=self.offset, alpha=self.alpha, eta=self.eta,
            update_every=self.update_every, eval_every=self.eval_every, gamma_threshold=self.gamma_threshold,
            serialized=self.serialized, serialization_path=self.serialization_path,
            minimum_probability=self.minimum_probability, random_state=self.random_state
        )

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Any previously fitted model is discarded and replaced by a new one built from `X` and the
        `author2doc` / `doc2author` mappings given to the constructor.

        Parameters
        ----------
        X : iterable of list of (int, number)
            Sequence of documents in BoW format.
        y : object, optional
            Ignored; accepted only so that the signature follows the scikit-learn `fit(X, y)` convention.

        Returns
        -------
        :class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer`
            The trained model.

        """
        self.gensim_model = self._build_model(X, self.author2doc, self.doc2author)
        return self

    def transform(self, author_names):
        """Infer the topic probabilities for each author.

        Parameters
        ----------
        author_names : {iterable of str, str}
            Author name or sequence of author names whose topics will be identified.

        Returns
        -------
        numpy.ndarray
            Topic distribution for each input author, of shape `(number of authors, num_topics)`.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If neither `fit` nor `partial_fit` has been called yet.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # Normalise the input to a list of names. A single name is wrapped; any other iterable (tuple, set,
        # generator, ...) is materialised so that it is looked up name by name and `len()` works below.
        # The `__iter__` test also catches Python 2 text types, which are not iterable through `__iter__`.
        if isinstance(author_names, list):
            names = author_names
        elif isinstance(author_names, (str, bytes)) or not hasattr(author_names, '__iter__'):
            names = [author_names]
        else:
            names = list(author_names)

        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        topics = [matutils.sparse2full(self.gensim_model[name], self.num_topics) for name in names]
        return np.reshape(np.array(topics), (len(names), self.num_topics))

    def partial_fit(self, X, author2doc=None, doc2author=None):
        """Train model over a potentially incomplete set of documents.

        This method can be used in two ways:
            * On an unfitted model in which case the model is initialized and trained on `X`.
            * On an already fitted model in which case the model is **updated** by `X`.

        Parameters
        ----------
        X : iterable of list of (int, number)
            Sequence of documents in BoW format.
        author2doc : dict of (str, list of int), optional
            Maps an author's name to a list of IDs of the documents in `X` they have contributed to.
            Either `author2doc` or `doc2author` **must be supplied**; on an unfitted model the mappings given
            to the constructor are used when both are omitted here.
        doc2author : dict of (int, list of str)
            Maps a document (using its ID) to a list of author names that contributed to it.
            Either `author2doc` or `doc2author` **must be supplied**; on an unfitted model the mappings given
            to the constructor are used when both are omitted here.

        Returns
        -------
        :class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer`
            The trained model.

        Raises
        ------
        ValueError
            If the model is unfitted and no author mapping is available, neither from the arguments
            nor from the constructor.

        """
        if self.gensim_model is None:
            if author2doc is None and doc2author is None:
                # First call without explicit mappings: fall back to the ones given at construction time.
                author2doc, doc2author = self.author2doc, self.doc2author
            if author2doc is None and doc2author is None:
                # Fail before building anything, so the transformer is not left half-initialised.
                raise ValueError(
                    "Either `author2doc` or `doc2author` must be supplied, to `partial_fit` or to the constructor."
                )
            # NOTE(review): AuthorTopicModel is understood to start training as soon as it receives a corpus
            # together with an author mapping (confirm against the gensim documentation). Calling `update`
            # with `X` afterwards would therefore feed the same documents to the model a second time,
            # so the first call stops here.
            self.gensim_model = self._build_model(X, author2doc, doc2author)
            return self

        self.gensim_model.update(corpus=X, author2doc=author2doc, doc2author=doc2author)
        return self
|