#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Scikit-learn interface for :class:`gensim.models.phrases.Phrases`.

Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.

Examples
--------
>>> from gensim.sklearn_api.phrases import PhrasesTransformer
>>>
>>> # Create the model. Make sure no term is ignored and combinations seen 3+ times are captured.
>>> m = PhrasesTransformer(min_count=1, threshold=3)
>>> texts = [
...     ['I', 'love', 'computer', 'science'],
...     ['computer', 'science', 'is', 'my', 'passion'],
...     ['I', 'studied', 'computer', 'science']
... ]
>>>
>>> # Use sklearn fit_transform to see the transformation.
>>> # Since "computer" and "science" were seen together 3+ times, they are considered a phrase.
>>> assert ['I', 'love', 'computer_science'] == m.fit_transform(texts)[0]

"""
from six import string_types
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError

from gensim import models


class PhrasesTransformer(TransformerMixin, BaseEstimator):
    """Base Phrases module, wraps :class:`~gensim.models.phrases.Phrases`.

    For more information, please have a look at `Mikolov, et. al: "Distributed Representations
    of Words and Phrases and their Compositionality" <https://arxiv.org/abs/1310.4546>`_ and
    `Gerlof Bouma: "Normalized (Pointwise) Mutual Information in Collocation Extraction"
    <https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf>`_.

    """
    def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
                 delimiter=b'_', progress_per=10000, scoring='default'):
        """

        Parameters
        ----------
        min_count : int, optional
            Terms with a count lower than this will be ignored.
        threshold : float, optional
            Only phrases scoring above this will be accepted, see `scoring` below.
        max_vocab_size : int, optional
            Maximum size of the vocabulary. Used to control pruning of less common words,
            to keep memory under control. The default of 40M needs about 3.6GB of RAM.
        delimiter : str, optional
            Character used to join collocation tokens, should be a byte string (e.g. b'_').
        progress_per : int, optional
            Progress will be reported to the logger after every that many sentences are processed.
        scoring : str or function, optional
            Specifies how potential phrases are scored for comparison to the `threshold`
            setting. `scoring` can be set with either a string that refers to a built-in scoring function,
            or with a function with the expected parameter names. Two built-in scoring functions are available
            by setting `scoring` to a string:

            * 'default': `Mikolov, et. al: "Distributed Representations of Words and Phrases
              and their Compositionality" <https://arxiv.org/abs/1310.4546>`_.
            * 'npmi': Explained in `Gerlof Bouma: "Normalized (Pointwise) Mutual Information in Collocation
              Extraction" <https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf>`_.

            'npmi' is more robust when dealing with common words that form part of common bigrams, and
            ranges from -1 to 1, but is slower to calculate than the default.

            To use a custom scoring function, create a function with the following parameters and set the `scoring`
            parameter to the custom function, see :func:`~gensim.models.phrases.original_scorer` as example.
            You must define all of the following parameters (even if not all of them are used):

            * worda_count: number of occurrences in `sentences` of the first token in the phrase being scored
            * wordb_count: number of occurrences in `sentences` of the second token in the phrase being scored
            * bigram_count: number of occurrences in `sentences` of the phrase being scored
            * len_vocab: the number of unique tokens in `sentences`
            * min_count: the `min_count` setting of the Phrases class
            * corpus_word_count: the total number of (non-unique) tokens in `sentences`

            A scoring function that does not accept all of these parameters (even if some of them are unused) will
            raise a ValueError on initialization of the Phrases class. The scoring function must be pickleable.
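
            For example, a minimal custom scorer sketch (the parameter names follow the list above;
            the plain count ratio used here is purely illustrative, not a recommended metric):

            >>> from gensim.sklearn_api.phrases import PhrasesTransformer
            >>>
            >>> def ratio_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
            ...     # Co-occurrence count relative to the product of the individual token counts.
            ...     return float(bigram_count) / (worda_count * wordb_count)
            >>>
            >>> m = PhrasesTransformer(min_count=1, threshold=0.5, scoring=ratio_scorer)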

        """
        self.gensim_model = None
        self.min_count = min_count
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
        self.delimiter = delimiter
        self.progress_per = progress_per
        self.scoring = scoring

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : iterable of list of str
            Sequence of sentences to be used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
            The trained model.
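
        Examples
        --------
        A minimal sketch, reusing the toy corpus from the module-level example above.

        >>> from gensim.sklearn_api.phrases import PhrasesTransformer
        >>>
        >>> m = PhrasesTransformer(min_count=1, threshold=3)
        >>> m = m.fit([
        ...     ['I', 'love', 'computer', 'science'],
        ...     ['computer', 'science', 'is', 'my', 'passion'],
        ...     ['I', 'studied', 'computer', 'science']
        ... ])
        >>> assert m.gensim_model is not None  # the wrapped gensim Phrases model is now trained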

        """
        self.gensim_model = models.Phrases(
            sentences=X, min_count=self.min_count, threshold=self.threshold,
            max_vocab_size=self.max_vocab_size, delimiter=self.delimiter,
            progress_per=self.progress_per, scoring=self.scoring
        )
        return self

    def transform(self, docs):
        """Transform the input documents into phrase tokens.

        Words in the sentence will be joined by `self.delimiter`.

        Parameters
        ----------
        docs : {iterable of list of str, list of str}
            Sequence of documents to be transformed.

        Returns
        -------
        list of (list of str)
            Phrase representation for each of the input sentences.
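
        Examples
        --------
        A minimal sketch: the model is fitted on the toy corpus from the module-level example,
        then applied to a made-up query sentence.

        >>> from gensim.sklearn_api.phrases import PhrasesTransformer
        >>>
        >>> m = PhrasesTransformer(min_count=1, threshold=3)
        >>> m = m.fit([
        ...     ['I', 'love', 'computer', 'science'],
        ...     ['computer', 'science', 'is', 'my', 'passion'],
        ...     ['I', 'studied', 'computer', 'science']
        ... ])
        >>>
        >>> # "computer science" was detected as a phrase during fitting, so its tokens are joined by the delimiter.
        >>> result = m.transform([['my', 'passion', 'is', 'computer', 'science']])[0]
        >>> assert result == ['my', 'passion', 'is', 'computer_science']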

        """
        if self.gensim_model is None:
            raise NotFittedError(
                "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
            )

        # Input is a single document (a plain list of tokens): wrap it so we always iterate over documents.
        if isinstance(docs[0], string_types):
            docs = [docs]
        return [self.gensim_model[doc] for doc in docs]

    def partial_fit(self, X):
        """Train model over a potentially incomplete set of sentences.

        This method can be used in two ways:
            1. On an unfitted model, in which case the model is initialized and trained on `X`.
            2. On an already fitted model, in which case the sentences in `X` are **added** to its vocabulary.

        Parameters
        ----------
        X : iterable of list of str
            Sequence of sentences to be used for training the model.

        Returns
        -------
        :class:`~gensim.sklearn_api.phrases.PhrasesTransformer`
            The trained model.
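
        Examples
        --------
        A minimal sketch: the same toy corpus as in the module-level example, fed in two batches.

        >>> from gensim.sklearn_api.phrases import PhrasesTransformer
        >>>
        >>> m = PhrasesTransformer(min_count=1, threshold=3)
        >>> m = m.partial_fit([['I', 'love', 'computer', 'science']])
        >>> m = m.partial_fit([
        ...     ['computer', 'science', 'is', 'my', 'passion'],
        ...     ['I', 'studied', 'computer', 'science']
        ... ])
        >>> assert m.transform([['I', 'love', 'computer', 'science']])[0] == ['I', 'love', 'computer_science']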

        """
        if self.gensim_model is None:
            # Initialize an empty Phrases model (no sentences yet); the batch `X` is added exactly once
            # via `add_vocab` below, which avoids counting the first batch twice.
            self.gensim_model = models.Phrases(
                sentences=None, min_count=self.min_count, threshold=self.threshold,
                max_vocab_size=self.max_vocab_size, delimiter=self.delimiter,
                progress_per=self.progress_per, scoring=self.scoring
            )

        self.gensim_model.add_vocab(X)
        return self