laywerrobot/lib/python3.6/site-packages/gensim/sklearn_api/lsimodel.py

163 lines
6 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Chinmaya Pancholi <chinmayapancholi13@gmail.com>
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Scikit learn interface for :class:`gensim.models.lsimodel.LsiModel`.
Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.
Examples
--------
Integrate with sklearn Pipelines:
>>> import numpy as np
>>> from sklearn.pipeline import Pipeline
>>> from sklearn import linear_model
>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.sklearn_api import LsiTransformer
>>>
>>> # Create stages for our pipeline (including gensim and sklearn models alike).
>>> model = LsiTransformer(num_topics=15, id2word=common_dictionary)
>>> clf = linear_model.LogisticRegression(penalty='l2', C=0.1)
>>> pipe = Pipeline([('features', model,), ('classifier', clf)])
>>>
>>> # Create some random binary labels for our documents.
>>> labels = np.random.choice([0, 1], len(common_corpus))
>>>
>>> # How well does our pipeline perform on the training set?
>>> score = pipe.fit(common_corpus, labels).score(common_corpus, labels)
"""
import numpy as np
from scipy import sparse
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
from gensim import models
from gensim import matutils
class LsiTransformer(TransformerMixin, BaseEstimator):
    """Base LSI module, wraps :class:`~gensim.models.lsimodel.LsiModel`.

    For more information please have a look to `Latent semantic analysis
    <https://en.wikipedia.org/wiki/Latent_semantic_analysis>`_.

    """

    def __init__(self, num_topics=200, id2word=None, chunksize=20000,
                 decay=1.0, onepass=True, power_iters=2, extra_samples=100):
        """

        Parameters
        ----------
        num_topics : int, optional
            Number of requested factors (latent dimensions).
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            ID to word mapping, optional.
        chunksize : int, optional
            Number of documents to be used in each training chunk.
        decay : float, optional
            Weight of existing observations relatively to new ones.
        onepass : bool, optional
            Whether the one-pass algorithm should be used for training, pass `False` to force a
            multi-pass stochastic algorithm.
        power_iters: int, optional
            Number of power iteration steps to be used.
            Increasing the number of power iterations improves accuracy, but lowers performance.
        extra_samples : int, optional
            Extra samples to be used besides the rank `k`. Can improve accuracy.

        """
        # The wrapped gensim model; stays None until `fit` or `partial_fit` is called.
        self.gensim_model = None
        self.num_topics = num_topics
        self.id2word = id2word
        self.chunksize = chunksize
        self.decay = decay
        self.onepass = onepass
        self.extra_samples = extra_samples
        self.power_iters = power_iters

    @staticmethod
    def _as_corpus(X):
        """Return `X` as a BOW corpus, converting a scipy sparse matrix whose rows are documents."""
        if sparse.issparse(X):
            # documents_columns=False: every row of the matrix is one document.
            return matutils.Sparse2Corpus(sparse=X, documents_columns=False)
        return X

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            A collection of documents in BOW format to be transformed.
            A sparse matrix is read with one document per row.
        y : object, optional
            Ignored; present only so the signature matches the scikit-learn convention.

        Returns
        -------
        :class:`~gensim.sklearn_api.lsimodel.LsiTransformer`
            The trained model.

        """
        self.gensim_model = models.LsiModel(
            corpus=self._as_corpus(X), num_topics=self.num_topics, id2word=self.id2word,
            chunksize=self.chunksize, decay=self.decay, onepass=self.onepass,
            power_iters=self.power_iters, extra_samples=self.extra_samples
        )
        return self

    def transform(self, docs):
        """Computes the latent factors for `docs`.

        Parameters
        ----------
        docs : {iterable of list of (int, number), list of (int, number), scipy.sparse matrix}
            Document or collection of documents in BOW format to be transformed.
            A sparse matrix is read with one document per row, the same layout `fit` accepts.

        Returns
        -------
        numpy.ndarray of shape [number of documents, `num_topics`]
            Topic distribution matrix. An empty collection yields an array of shape [0, `num_topics`].

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If neither `fit` nor `partial_fit` has been called yet.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        if sparse.issparse(docs):
            # `len()` is not defined for scipy sparse matrices and their rows are not BOW
            # documents, so materialise the rows as a list of BOW documents first.
            docs = list(self._as_corpus(docs))
        elif len(docs) > 0 and isinstance(docs[0], tuple):
            # A single document was passed: wrap it so the code below always sees a collection.
            # The length guard keeps an empty collection from failing on `docs[0]`.
            docs = [docs]

        # returning dense representation for compatibility with sklearn
        # but we should go back to sparse representation in the future
        distribution = [matutils.sparse2full(self.gensim_model[doc], self.num_topics) for doc in docs]
        return np.reshape(np.array(distribution), (len(docs), self.num_topics))

    def partial_fit(self, X, y=None):
        """Train model over a potentially incomplete set of documents.

        This method can be used in two ways:

        1. On an unfitted model in which case the model is initialized and trained on `X`.
        2. On an already fitted model in which case the model is **further** trained on `X`.

        Parameters
        ----------
        X : {iterable of list of (int, number), scipy.sparse matrix}
            Stream of document vectors or sparse matrix of shape: [`num_documents`, `num_terms`]
            (one document per row).
        y : object, optional
            Ignored; present only so the signature matches `fit`.

        Returns
        -------
        :class:`~gensim.sklearn_api.lsimodel.LsiTransformer`
            The trained model.

        """
        corpus = self._as_corpus(X)

        if self.gensim_model is None:
            # First call: create an untrained model; the documents are added just below.
            self.gensim_model = models.LsiModel(
                num_topics=self.num_topics, id2word=self.id2word, chunksize=self.chunksize, decay=self.decay,
                onepass=self.onepass, power_iters=self.power_iters, extra_samples=self.extra_samples
            )

        self.gensim_model.add_documents(corpus=corpus)
        return self