laywerrobot/lib/python3.6/site-packages/gensim/sklearn_api/phrases.py
2020-08-27 21:55:39 +02:00

171 lines
7.2 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Scikit learn interface for `gensim.models.phrases.Phrases`.
Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.
Examples
--------
>>> from gensim.sklearn_api.phrases import PhrasesTransformer
>>>
>>> # Create the model. Make sure no term is ignored and only combinations scoring above 3 are captured.
>>> m = PhrasesTransformer(min_count=1, threshold=3)
>>> texts = [
... ['I', 'love', 'computer', 'science'],
... ['computer', 'science', 'is', 'my', 'passion'],
... ['I', 'studied', 'computer', 'science']
... ]
>>>
>>> # Use sklearn fit_transform to see the transformation.
>>> # Since the bigram ('computer', 'science') scores above the threshold, it is considered a phrase.
>>> assert ['I', 'love', 'computer_science'] == m.fit_transform(texts)[0]
"""
from six import string_types
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
from gensim import models
class PhrasesTransformer(TransformerMixin, BaseEstimator):
    """Base Phrases module, wraps :class:`~gensim.models.phrases.Phrases`.

    For more information, see `Mikolov, et. al: "Distributed Representations
    of Words and Phrases and their Compositionality" <https://arxiv.org/abs/1310.4546>`_ and
    `Gerlof Bouma: "Normalized (Pointwise) Mutual Information in Collocation Extraction"
    <https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf>`_.

    """

    def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
                 delimiter=b'_', progress_per=10000, scoring='default'):
        """

        Parameters
        ----------
        min_count : int, optional
            Terms with a count lower than this will be ignored
        threshold : float, optional
            Only phrases scoring above this will be accepted, see `scoring` below.
        max_vocab_size : int, optional
            Maximum size of the vocabulary. Used to control pruning of less common words, to keep memory under control.
            The default of 40M needs about 3.6GB of RAM.
        delimiter : bytes, optional
            Character used to join collocation tokens, should be a byte string (e.g. b'_').
        progress_per : int, optional
            Training will report to the logger every that many phrases are learned.
        scoring : str or function, optional
            Specifies how potential phrases are scored for comparison to the `threshold`
            setting. `scoring` can be set with either a string that refers to a built-in scoring function,
            or with a function with the expected parameter names. Two built-in scoring functions are available
            by setting `scoring` to a string:

                * 'default': `Mikolov, et. al: "Distributed Representations of Words and Phrases
                  and their Compositionality" <https://arxiv.org/abs/1310.4546>`_.
                * 'npmi': Explained in `Gerlof Bouma: "Normalized (Pointwise) Mutual Information in Collocation
                  Extraction" <https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf>`_.

            'npmi' is more robust when dealing with common words that form part of common bigrams, and
            ranges from -1 to 1, but is slower to calculate than the default.

            To use a custom scoring function, create a function with the following parameters and set the `scoring`
            parameter to the custom function, see :func:`~gensim.models.phrases.original_scorer` as example.
            You must define all the parameters (but can use only part of it):

                * worda_count: number of occurrences in `sentences` of the first token in the phrase being scored
                * wordb_count: number of occurrences in `sentences` of the second token in the phrase being scored
                * bigram_count: number of occurrences in `sentences` of the phrase being scored
                * len_vocab: the number of unique tokens in `sentences`
                * min_count: the `min_count` setting of the Phrases class
                * corpus_word_count: the total number of (non-unique) tokens in `sentences`

            A scoring function without any of these parameters (even if the parameters are not used) will
            raise a ValueError on initialization of the Phrases class. The scoring function must be pickleable.

        """
        # `None` marks the estimator as unfitted; `transform` and `partial_fit` check for it.
        self.gensim_model = None
        self.min_count = min_count
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
        self.delimiter = delimiter
        self.progress_per = progress_per
        self.scoring = scoring

    def _new_model(self, sentences=None):
        """Build a `Phrases` model from the stored hyper-parameters, optionally trained on `sentences`."""
        return models.Phrases(
            sentences=sentences, min_count=self.min_count, threshold=self.threshold,
            max_vocab_size=self.max_vocab_size, delimiter=self.delimiter,
            progress_per=self.progress_per, scoring=self.scoring
        )

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Any previously trained model is discarded and replaced.

        Parameters
        ----------
        X : iterable of list of str
            Sequence of sentences to be used for training the model.
        y : object, optional
            Ignored; accepted only for scikit-learn API compatibility.

        Returns
        -------
        :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
            The trained model.

        """
        self.gensim_model = self._new_model(sentences=X)
        return self

    def transform(self, docs):
        """Transform the input documents into phrase tokens.

        Words in the sentence will be joined by `self.delimiter`.

        Parameters
        ----------
        docs : {iterable of list of str, list of str}
            Sequence of documents to be transformed. A single document
            (a flat list of str) is also accepted.

        Returns
        -------
        list
            One phrase-joined representation per input document; empty if `docs` is empty.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If neither `fit` nor `partial_fit` has been called yet.

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # Peek at the first element to tell a single document from a batch.
        # An empty input has nothing to transform, so return early instead of
        # letting the lookup fail with IndexError.
        try:
            first = docs[0]
        except IndexError:
            return []

        # A flat list of tokens is one document: wrap it so the loop below is uniform.
        if isinstance(first, string_types):
            docs = [docs]
        return [self.gensim_model[doc] for doc in docs]

    def partial_fit(self, X):
        """Train model over a potentially incomplete set of sentences.

        This method can be used in two ways:
            1. On an unfitted model in which case the model is initialized and trained on `X`.
            2. On an already fitted model in which case the X sentences are **added** to the vocabulary.

        Parameters
        ----------
        X : iterable of list of str
            Sequence of sentences to be used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
            The trained model.

        """
        if self.gensim_model is None:
            # Start from an empty model: the sentences are added exactly once
            # below. Passing X to the constructor as well would count the first
            # batch twice (and would exhaust a one-shot iterable before the
            # second pass).
            self.gensim_model = self._new_model()

        self.gensim_model.add_vocab(X)
        return self