laywerrobot/lib/python3.6/site-packages/gensim/models/tfidfmodel.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Radim Rehurek <radimrehurek@seznam.cz>
# Copyright (C) 2017 Mohit Rathore <mrmohitrathoremr@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module implements functionality related to the `Term Frequency - Inverse Document Frequency
<https://en.wikipedia.org/wiki/Tf%E2%80%93idf>`_ vector space bag-of-words models.

For a more in-depth exposition of TF-IDF and its various SMART variants (normalization, weighting schemes),
see the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/

"""

import logging
from functools import partial

from gensim import interfaces, matutils, utils
from six import iteritems

import numpy as np

logger = logging.getLogger(__name__)


def resolve_weights(smartirs):
    """Validate a SMART mnemonic and split it into its three component letters.

    Parameters
    ----------
    smartirs : str
        `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text)
        Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting
        variants in the vector space model. The mnemonic for representing a combination
        of weights takes the form ddd, where the letters represent the term weighting of the document vector.
        For more information visit `SMART Information Retrieval System
        <https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.

    Returns
    -------
    3-tuple (local_letter, global_letter, normalization_letter)

    local_letter : str
        Term frequency weighing, one of:
            * `n` - natural,
            * `l` - logarithm,
            * `a` - augmented,
            * `b` - boolean,
            * `L` - log average.
    global_letter : str
        Document frequency weighting, one of:
            * `n` - none,
            * `t` - idf,
            * `p` - prob idf.
    normalization_letter : str
        Document normalization, one of:
            * `n` - none,
            * `c` - cosine.

    Raises
    ------
    ValueError
        If `smartirs` is not a string of length 3 or one of the decomposed values
        doesn't fit the list of permissible values.

    """
    if not isinstance(smartirs, str) or len(smartirs) != 3:
        # `smartirs` may not be a string on this path, so it must be formatted rather than
        # concatenated: concatenation would raise TypeError instead of the documented ValueError.
        raise ValueError("Expected a string of length 3 except got {}".format(smartirs))

    w_tf, w_df, w_n = smartirs

    if w_tf not in 'nlabL':
        raise ValueError("Expected term frequency weight to be one of 'nlabL', except got {}".format(w_tf))

    if w_df not in 'ntp':
        raise ValueError("Expected inverse document frequency weight to be one of 'ntp', except got {}".format(w_df))

    if w_n not in 'nc':
        # the message lists exactly the letters accepted by the check above
        raise ValueError("Expected normalization weight to be one of 'nc', except got {}".format(w_n))

    return w_tf, w_df, w_n


def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
    r"""Return the inverse document frequency of a term that occurs in `docfreq` documents:
    :math:`idf = add + log_{log\_base} \frac{totaldocs}{docfreq}`

    Parameters
    ----------
    docfreq : {int, float}
        Number of documents the term appears in.
    totaldocs : int
        Size of the whole collection, in documents.
    log_base : float, optional
        Base of the logarithm applied to the ratio.
    add : float, optional
        Constant offset added to the logarithm.

    Returns
    -------
    float
        Inverse document frequency.

    """
    # Force true division so integer inputs are never floor-divided.
    ratio = float(totaldocs) / docfreq
    # Change of base: log_b(x) = ln(x) / ln(b).
    scaled_log = np.log(ratio) / np.log(log_base)
    return add + scaled_log


def precompute_idfs(wglobal, dfs, total_docs):
    """Build the term -> global weight lookup table for every term in `dfs`.

    Parameters
    ----------
    wglobal : function
        Callable computing the "global" weight; it is invoked as `wglobal(doc_freq, total_docs)`.
        See for example the SMART alternatives under :func:`~gensim.models.tfidfmodel.smartirs_wglobal`.
    dfs : dict
        Mapping of `term_id` to the number of documents that term appeared in.
    total_docs : int
        Total number of documents.

    Returns
    -------
    dict of (int, float)
        Inverse document frequencies in the format `{term_id_1: idfs_1, term_id_2: idfs_2, ...}`.

    """
    # The weights could also be evaluated lazily, one lookup at a time; computing the
    # whole table up front simply avoids repeating that work for every document.
    idfs = {}
    for term_id, doc_freq in iteritems(dfs):
        idfs[term_id] = wglobal(doc_freq, total_docs)
    return idfs


def smartirs_wlocal(tf, local_scheme):
    """Calculate local term weights for a document using the weighting scheme specified in `local_scheme`.

    Parameters
    ----------
    tf : numpy.ndarray or array-like
        Term frequencies of one document (one entry per term).
    local_scheme : {'n', 'l', 'a', 'b', 'L'}
        Local transformation scheme:
            * `n` - natural: frequencies are returned unchanged,
            * `l` - logarithm: `1 + log2(tf)`,
            * `a` - augmented: `0.5 + 0.5 * tf / max(tf)`,
            * `b` - boolean: 1 where `tf` is non-zero, 0 elsewhere,
            * `L` - log average: `(1 + log2(tf)) / (1 + log2(mean(tf)))`.

    Returns
    -------
    numpy.ndarray
        Calculated local weights, same shape as `tf` (for `n`, the input object itself).

    Raises
    ------
    ValueError
        If `local_scheme` is not one of the supported letters.

    """
    if local_scheme == "n":
        return tf
    if local_scheme not in ("l", "a", "b", "L"):
        # fail loudly instead of silently handing `None` back to the caller
        raise ValueError("Expected local scheme to be one of 'nlabL', except got {}".format(local_scheme))

    # the remaining schemes rely on ndarray methods, so accept plain sequences as well
    tf = np.asarray(tf)

    if local_scheme == "l":
        return 1 + np.log2(tf)
    if local_scheme == "b":
        return tf.astype('bool').astype('int')

    # `a` and `L` reduce over the whole document; an empty document has no max/mean,
    # so return an empty float array rather than raising or computing on NaN.
    if tf.size == 0:
        return tf.astype(float)

    if local_scheme == "a":
        return 0.5 + (0.5 * tf / tf.max(axis=0))
    return (1 + np.log2(tf)) / (1 + np.log2(tf.mean(axis=0)))


def smartirs_wglobal(docfreq, totaldocs, global_scheme):
    """Calculate global document weight based on the weighting scheme specified in `global_scheme`.

    Parameters
    ----------
    docfreq : int
        Document frequency.
    totaldocs : int
        Total number of documents.
    global_scheme : {'n', 't', 'p'}
        Global transformation scheme:
            * `n` - none: constant weight 1,
            * `t` - idf: `log2(totaldocs / docfreq)`,
            * `p` - prob idf: `max(0, log2((totaldocs - docfreq) / docfreq))`.

    Returns
    -------
    float
        Calculated global weight.

    Raises
    ------
    ValueError
        If `global_scheme` is not one of the supported letters.

    """
    if global_scheme == "n":
        return 1.
    if global_scheme == "t":
        return np.log2(1.0 * totaldocs / docfreq)
    if global_scheme == "p":
        odds = (1.0 * totaldocs - docfreq) / docfreq
        # log2(odds) <= 0 whenever odds <= 1, and the weight is clipped at zero anyway.
        # Short-circuiting here avoids evaluating log2 of zero or of a negative number
        # (which emits a RuntimeWarning) and keeps the return type a float.
        if odds <= 1.0:
            return 0.0
        return np.log2(odds)
    # fail loudly instead of silently handing `None` back to the caller
    raise ValueError("Expected global scheme to be one of 'ntp', except got {}".format(global_scheme))


def smartirs_normalize(x, norm_scheme, return_norm=False):
    """Apply the normalization selected by `norm_scheme` to the vector `x`.

    Parameters
    ----------
    x : numpy.ndarray
        Input vector; it is handed to :func:`~gensim.matutils.unitvec` whenever a norm is needed.
    norm_scheme : {'n', 'c'}
        Normalizing function to use:
        `n`: no normalization (`x` is returned as is)
        `c`: unit L2 norm (scale `x` to unit euclidean length)
    return_norm : bool, optional
        Return the length of `x` as well?

    Returns
    -------
    numpy.ndarray
        Normalized array.
    float (only if return_norm is set)
        Norm of `x` as reported by :func:`~gensim.matutils.unitvec`.

    """
    if norm_scheme == "c":
        return matutils.unitvec(x, return_norm=return_norm)

    if norm_scheme == "n":
        if not return_norm:
            return x
        # The vector itself stays untouched; unitvec is called only to obtain its length.
        _, norm = matutils.unitvec(x, return_norm=return_norm)
        return x, norm

    # Any other scheme letter falls through without a result.
    return None


class TfidfModel(interfaces.TransformationABC):
    """Objects of this class realize the transformation between word-document co-occurrence matrix (int)
    into a locally/globally weighted TF-IDF matrix (positive floats).

    Examples
    --------
    >>> import gensim.downloader as api
    >>> from gensim.models import TfidfModel
    >>> from gensim.corpora import Dictionary
    >>>
    >>> dataset = api.load("text8")
    >>> dct = Dictionary(dataset)  # fit dictionary
    >>> corpus = [dct.doc2bow(line) for line in dataset]  # convert corpus to BoW format
    >>>
    >>> model = TfidfModel(corpus)  # fit model
    >>> vector = model[corpus[0]]  # apply model to the first corpus document

    """
    def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity,
                 wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.65):
        r"""Compute TF-IDF by multiplying a local component (term frequency) with a global component
        (inverse document frequency), and normalizing the resulting documents to unit length.
        Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents

        .. math:: weight_{i,j} = frequency_{i,j} * log_2 \frac{D}{document\_freq_{i}}

        or, more generally

        .. math:: weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document\_freq_{i}, D)

        so you can plug in your own custom :math:`wlocal` and :math:`wglobal` functions.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int), optional
            Input corpus
        id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional
            Mapping token - id, that was used for converting input data to bag of words format.
        dictionary : :class:`~gensim.corpora.Dictionary`
            If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used
            to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored).
        wlocal : function, optional
            Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity`.
            The function is called once per document with a :class:`numpy.ndarray` holding all term
            frequencies of that document, so it must be array-aware
            (e.g. :func:`numpy.sqrt`, :func:`numpy.log1p`).
        wglobal : function, optional
            Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`.
        normalize : bool, optional
            Normalize document vectors to unit euclidean length? You can also inject your own function into `normalize`.
        smartirs : str, optional
            SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System,
            a mnemonic scheme for denoting tf-idf weighting variants in the vector space model.
            The mnemonic for representing a combination of weights takes the form XYZ,
            for example 'ntc', 'bpn' and so on, where the letters represent the term weighting of the document vector.
            When given, it overrides `wlocal`, `wglobal` and `normalize`.

            Term frequency weighing:
                * `n` - natural,
                * `l` - logarithm,
                * `a` - augmented,
                * `b` - boolean,
                * `L` - log average.

            Document frequency weighting:
                * `n` - none,
                * `t` - idf,
                * `p` - prob idf.

            Document normalization:
                * `n` - none,
                * `c` - cosine.

            For more information visit `SMART Information Retrieval System
            <https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.
        pivot : float, optional
            See the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/.

            Pivot is the point around which the regular normalization curve is `tilted` to get the new pivoted
            normalization curve. In the paper `Amit Singhal, Chris Buckley, Mandar Mitra:
            "Pivoted Document Length Normalization" <http://singhal.info/pivoted-dln.pdf>`_ it is the point where the
            retrieval and relevance curves intersect.

            This parameter along with `slope` is used for pivoted document length normalization.
            Only when `pivot` is not None will pivoted document length normalization be applied.
            Otherwise, regular TfIdf is used.
        slope : float, optional
            Parameter required by pivoted document length normalization which determines the slope to which
            the `old normalization` can be tilted. This parameter only works when pivot is defined.

        """
        self.id2word = id2word
        self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize
        self.num_docs, self.num_nnz, self.idfs = None, None, None
        self.smartirs = smartirs
        self.slope = slope
        self.pivot = pivot
        self.eps = 1e-12

        # If smartirs is not None, override wlocal, wglobal and normalize
        if smartirs is not None:
            n_tf, n_df, n_n = resolve_weights(smartirs)
            self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf)
            self.wglobal = partial(smartirs_wglobal, global_scheme=n_df)
            # also return norm factor if pivot is not none
            if self.pivot is None:
                self.normalize = partial(smartirs_normalize, norm_scheme=n_n)
            else:
                self.normalize = partial(smartirs_normalize, norm_scheme=n_n, return_norm=True)

        if dictionary is not None:
            # user supplied a Dictionary object, which already contains all the
            # statistics we need to construct the IDF mapping. we can skip the
            # step that goes through the corpus (= an optimization).
            if corpus is not None:
                logger.warning(
                    "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus"
                )
            self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
            self.dfs = dictionary.dfs.copy()
            self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
            if id2word is None:
                self.id2word = dictionary
        elif corpus is not None:
            self.initialize(corpus)
        else:
            # NOTE: everything is left uninitialized; presumably the model will
            # be initialized in some other way
            pass

    @classmethod
    def load(cls, *args, **kwargs):
        """Load a previously saved TfidfModel class. Handles backwards compatibility from
        older TfidfModel versions which did not store the `pivot`, `slope` or `smartirs` attributes.

        Parameters
        ----------
        *args, **kwargs
            Forwarded unchanged to the parent class `load`.

        Returns
        -------
        :class:`~gensim.models.tfidfmodel.TfidfModel`
            The loaded model, with any missing attribute set to its constructor default.

        """
        model = super(TfidfModel, cls).load(*args, **kwargs)
        if not hasattr(model, 'pivot'):
            model.pivot = None
            logger.info('older version of %s loaded without pivot arg', cls.__name__)
            logger.info('Setting pivot to %s.', model.pivot)
        if not hasattr(model, 'slope'):
            model.slope = 0.65
            logger.info('older version of %s loaded without slope arg', cls.__name__)
            logger.info('Setting slope to %s.', model.slope)
        if not hasattr(model, 'smartirs'):
            # `smartirs` is assigned in `__init__`; backfill it so the attribute always exists
            model.smartirs = None
            logger.info('older version of %s loaded without smartirs arg', cls.__name__)
            logger.info('Setting smartirs to %s.', model.smartirs)
        return model

    def __str__(self):
        return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz)

    def initialize(self, corpus):
        """Compute inverse document weights, which will be used to modify term frequencies for documents.

        Sets `num_docs`, `num_nnz`, `dfs` and `idfs` on the model.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Input corpus.

        """
        logger.info("collecting document frequencies")
        dfs = {}
        # docno starts at -1 so that an empty corpus yields num_docs == 0 below
        numnnz, docno = 0, -1

        for docno, bow in enumerate(corpus):
            if docno % 10000 == 0:
                logger.info("PROGRESS: processing document #%i", docno)
            numnnz += len(bow)
            for termid, _ in bow:
                dfs[termid] = dfs.get(termid, 0) + 1
        # keep some stats about the training corpus
        self.num_docs = docno + 1
        self.num_nnz = numnnz
        self.dfs = dfs
        # and finally compute the idf weights
        # term ids are zero-based, so the feature count is the largest id plus one
        n_features = (1 + max(dfs)) if dfs else 0
        logger.info(
            "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)",
            self.num_docs, n_features, self.num_nnz
        )
        self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)

    def __getitem__(self, bow, eps=1e-12):
        """Get the tf-idf representation of an input vector and/or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of iterable of (int, int)}
            Input document in the `sparse Gensim bag-of-words format
            <https://radimrehurek.com/gensim/intro.html#core-concepts>`_,
            or a streamed corpus of such documents.
        eps : float
            Threshold value, will remove all position that have tfidf-value less than `eps`.
            The value is also stored on the model as `self.eps`.

        Returns
        -------
        vector : list of (int, float)
            TfIdf vector, if `bow` is a single document
        :class:`~gensim.interfaces.TransformedCorpus`
            TfIdf corpus, if `bow` is a corpus.

        """
        self.eps = eps
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)

        termid_array, tf_array = [], []
        for termid, tf in bow:
            termid_array.append(termid)
            tf_array.append(tf)

        # the local weighting sees the whole document at once, as a numpy array
        tf_array = self.wlocal(np.array(tf_array))

        vector = []
        for termid, tf in zip(termid_array, tf_array):
            # single lookup per term; terms without an idf entry count as zero and are dropped
            idf = self.idfs.get(termid, 0.0)
            if abs(idf) > self.eps:
                vector.append((termid, tf * idf))

        # the boolean shorthand is resolved to a concrete function (and stored back on the model)
        if self.normalize is True:
            self.normalize = matutils.unitvec
        elif self.normalize is False:
            self.normalize = utils.identity

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.pivot is None:
            norm_vector = self.normalize(vector)
            norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps]
        else:
            if self.normalize is utils.identity:
                # NOTE(review): `utils.identity` is presumably a plain one-argument pass-through
                # with no `return_norm` keyword (confirm against gensim.utils), so `normalize=False`
                # combined with a pivot used to fail here. The vector length is still needed for
                # pivoting, so take it from `matutils.unitvec`, as `smartirs_normalize` does for `n`.
                _, old_norm = matutils.unitvec(vector, return_norm=True)
            else:
                _, old_norm = self.normalize(vector, return_norm=True)
            pivoted_norm = float((1 - self.slope) * self.pivot + self.slope * old_norm)
            norm_vector = []
            for termid, weight in vector:
                scaled = weight / pivoted_norm
                if abs(scaled) > self.eps:
                    norm_vector.append((termid, scaled))
        return norm_vector
first commit 2020-08-27 21:55:39 +02:00			`#!/usr/bin/env python`
			`# -- coding: utf-8 --`
			`#`
			`# Copyright (C) 2012 Radim Rehurek <radimrehurek@seznam.cz>`
			`# Copyright (C) 2017 Mohit Rathore <mrmohitrathoremr@gmail.com>`
			`# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html`

			"""This module implements functionality related to the `Term Frequency - Inverse Document Frequency
			<https://en.wikipedia.org/wiki/Tf%E2%80%93idf>` vector space bag-of-words models.

			`For a more in-depth exposition of TF-IDF and its various SMART variants (normalization, weighting schemes),`
			`see the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/`

			`"""`

			`import logging`
			`from functools import partial`

			`from gensim import interfaces, matutils, utils`
			`from six import iteritems`

			`import numpy as np`

			`logger = logging.getLogger(__name__)`


			`def resolve_weights(smartirs):`
			"""Check the validity of `smartirs` parameters.

			`Parameters`
			`----------`
			`smartirs : str`
			`smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text)
			`Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting`
			`variants in the vector space model. The mnemonic for representing a combination`
			`of weights takes the form ddd, where the letters represents the term weighting of the document vector.`
			for more information visit `SMART Information Retrieval System
			<https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.

			`Returns`
			`-------`
			`3-tuple (local_letter, global_letter, normalization_letter)`

			`local_letter : str`
			`Term frequency weighing, one of:`
			* `n` - natural,
			* `l` - logarithm,
			* `a` - augmented,
			* `b` - boolean,
			* `L` - log average.
			`global_letter : str`
			`Document frequency weighting, one of:`
			* `n` - none,
			* `t` - idf,
			* `p` - prob idf.
			`normalization_letter : str`
			`Document normalization, one of:`
			* `n` - none,
			* `c` - cosine.

			`Raises`
			`------`
			`ValueError`
			If `smartirs` is not a string of length 3 or one of the decomposed value
			`doesn't fit the list of permissible values.`

			`"""`
			`if not isinstance(smartirs, str) or len(smartirs) != 3:`
			`raise ValueError("Expected a string of length 3 except got " + smartirs)`

			`w_tf, w_df, w_n = smartirs`

			`if w_tf not in 'nlabL':`
			`raise ValueError("Expected term frequency weight to be one of 'nlabL', except got {}".format(w_tf))`

			`if w_df not in 'ntp':`
			`raise ValueError("Expected inverse document frequency weight to be one of 'ntp', except got {}".format(w_df))`

			`if w_n not in 'nc':`
			`raise ValueError("Expected normalization weight to be one of 'ncb', except got {}".format(w_n))`

			`return w_tf, w_df, w_n`


			`def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):`
			"""Compute inverse-document-frequency for a term with the given document frequency `docfreq`:
			:math:`idf = add + log_{log\_base} \\frac{totaldocs}{docfreq}`

			`Parameters`
			`----------`
			`docfreq : {int, float}`
			`Document frequency.`
			`totaldocs : int`
			`Total number of documents.`
			`log_base : float, optional`
			`Base of logarithm.`
			`add : float, optional`
			`Offset.`

			`Returns`
			`-------`
			`float`
			`Inverse document frequency.`

			`"""`
			`return add + np.log(float(totaldocs) / docfreq) / np.log(log_base)`


			`def precompute_idfs(wglobal, dfs, total_docs):`
			`"""Pre-compute the inverse document frequency mapping for all terms.`

			`Parameters`
			`----------`
			`wglobal : function`
			`Custom function for calculating the "global" weighting function.`
			See for example the SMART alternatives under :func:`~gensim.models.tfidfmodel.smartirs_wglobal`.
			`dfs : dict`
			Dictionary mapping `term_id` into how many documents did that term appear in.
			`total_docs : int`
			`Total number of documents.`

			`Returns`
			`-------`
			`dict of (int, float)`
			Inverse document frequencies in the format `{term_id_1: idfs_1, term_id_2: idfs_2, ...}`.

			`"""`
			`# not strictly necessary and could be computed on the fly in TfidfModel__getitem__.`
			`# this method is here just to speed things up a little.`
			`return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)}`


			`def smartirs_wlocal(tf, local_scheme):`
			"""Calculate local term weight for a term using the weighting scheme specified in `local_scheme`.

			`Parameters`
			`----------`
			`tf : int`
			`Term frequency.`
			`local : {'n', 'l', 'a', 'b', 'L'}`
			`Local transformation scheme.`

			`Returns`
			`-------`
			`float`
			`Calculated local weight.`

			`"""`
			`if local_scheme == "n":`
			`return tf`
			`elif local_scheme == "l":`
			`return 1 + np.log2(tf)`
			`elif local_scheme == "a":`
			`return 0.5 + (0.5 * tf / tf.max(axis=0))`
			`elif local_scheme == "b":`
			`return tf.astype('bool').astype('int')`
			`elif local_scheme == "L":`
			`return (1 + np.log2(tf)) / (1 + np.log2(tf.mean(axis=0)))`


			`def smartirs_wglobal(docfreq, totaldocs, global_scheme):`
			"""Calculate global document weight based on the weighting scheme specified in `global_scheme`.

			`Parameters`
			`----------`
			`docfreq : int`
			`Document frequency.`
			`totaldocs : int`
			`Total number of documents.`
			`global_scheme : {'n', 't', 'p'}`
			`Global transformation scheme.`

			`Returns`
			`-------`
			`float`
			`Calculated global weight.`

			`"""`

			`if global_scheme == "n":`
			`return 1.`
			`elif global_scheme == "t":`
			`return np.log2(1.0 * totaldocs / docfreq)`
			`elif global_scheme == "p":`
			`return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq))`


			`def smartirs_normalize(x, norm_scheme, return_norm=False):`
			"""Normalize a vector using the normalization scheme specified in `norm_scheme`.

			`Parameters`
			`----------`
			`x : numpy.ndarray`
			`Input array`
			`norm_scheme : {'n', 'c'}`
			`Normalizing function to use:`
			`n`: no normalization
			`c`: unit L2 norm (scale `x` to unit euclidean length)
			`return_norm : bool, optional`
			Return the length of `x` as well?

			`Returns`
			`-------`
			`numpy.ndarray`
			`Normalized array.`
			`float (only if return_norm is set)`
			L2 norm of `x`.

			`"""`
			`if norm_scheme == "n":`
			`if return_norm:`
			`_, length = matutils.unitvec(x, return_norm=return_norm)`
			`return x, length`
			`else:`
			`return x`
			`elif norm_scheme == "c":`
			`return matutils.unitvec(x, return_norm=return_norm)`


			`class TfidfModel(interfaces.TransformationABC):`
			`"""Objects of this class realize the transformation between word-document co-occurrence matrix (int)`
			`into a locally/globally weighted TF-IDF matrix (positive floats).`

			`Examples`
			`--------`
			`>>> import gensim.downloader as api`
			`>>> from gensim.models import TfidfModel`
			`>>> from gensim.corpora import Dictionary`
			`>>>`
			`>>> dataset = api.load("text8")`
			`>>> dct = Dictionary(dataset) # fit dictionary`
			`>>> corpus = [dct.doc2bow(line) for line in dataset] # convert corpus to BoW format`
			`>>>`
			`>>> model = TfidfModel(corpus) # fit model`
			`>>> vector = model[corpus[0]] # apply model to the first corpus document`

			`"""`
			`def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity,`
			`wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.65):`
			`"""Compute TF-IDF by multiplying a local component (term frequency) with a global component`
			`(inverse document frequency), and normalizing the resulting documents to unit length.`
			Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents

			`.. math:: weight_{i,j} = frequency_{i,j} * log_2 \\frac{D}{document\_freq_{i}}`

			`or, more generally`

			`.. math:: weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document\_freq_{i}, D)`

			so you can plug in your own custom :math:`wlocal` and :math:`wglobal` functions.

			`Parameters`
			`----------`
			`corpus : iterable of iterable of (int, int), optional`
			`Input corpus`
			id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional
			`Mapping token - id, that was used for converting input data to bag of words format.`
			dictionary : :class:`~gensim.corpora.Dictionary`
			If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used.
			to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored).
			`wlocals : function, optional`
			Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity`
			(other options: :func:`math.sqrt`, :func:`math.log1p`, etc).
			`wglobal : function, optional`
			Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`.
			`normalize : bool, optional`
			Normalize document vectors to unit euclidean length? You can also inject your own function into `normalize`.
			`smartirs : str, optional`
			`SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System,`
			`a mnemonic scheme for denoting tf-idf weighting variants in the vector space model.`
			`The mnemonic for representing a combination of weights takes the form XYZ,`
			`for example 'ntc', 'bpn' and so on, where the letters represents the term weighting of the document vector.`

			`Term frequency weighing:`
			* `n` - natural,
			* `l` - logarithm,
			* `a` - augmented,
			* `b` - boolean,
			* `L` - log average.

			`Document frequency weighting:`
			* `n` - none,
			* `t` - idf,
			* `p` - prob idf.

			`Document normalization:`
			* `n` - none,
			* `c` - cosine.

			For more information visit `SMART Information Retrieval System
			<https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.
			`pivot : float, optional`
			`See the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/.`

			Pivot is the point around which the regular normalization curve is `tilted` to get the new pivoted
			normalization curve. In the paper `Amit Singhal, Chris Buckley, Mandar Mitra:
			"Pivoted Document Length Normalization" <http://singhal.info/pivoted-dln.pdf>`_ it is the point where the
			`retrieval and relevance curves intersect.`

			This parameter along with `slope` is used for pivoted document length normalization.
			Only when `pivot` is not None will pivoted document length normalization be applied.
			`Otherwise, regular TfIdf is used.`
			`slope : float, optional`
			`Parameter required by pivoted document length normalization which determines the slope to which`
			the `old normalization` can be tilted. This parameter only works when pivot is defined.

			`"""`
			`self.id2word = id2word`
			`self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize`
			`self.num_docs, self.num_nnz, self.idfs = None, None, None`
			`self.smartirs = smartirs`
			`self.slope = slope`
			`self.pivot = pivot`
			`self.eps = 1e-12`

			`# If smartirs is not None, override wlocal, wglobal and normalize`
			`if smartirs is not None:`
			`n_tf, n_df, n_n = resolve_weights(smartirs)`
			`self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf)`
			`self.wglobal = partial(smartirs_wglobal, global_scheme=n_df)`
			`# also return norm factor if pivot is not none`
			`if self.pivot is None:`
			`self.normalize = partial(smartirs_normalize, norm_scheme=n_n)`
			`else:`
			`self.normalize = partial(smartirs_normalize, norm_scheme=n_n, return_norm=True)`

			`if dictionary is not None:`
			`# user supplied a Dictionary object, which already contains all the`
			`# statistics we need to construct the IDF mapping. we can skip the`
			`# step that goes through the corpus (= an optimization).`
			`if corpus is not None:`
			`logger.warning(`
			`"constructor received both corpus and explicit inverse document frequencies; ignoring the corpus"`
			`)`
			`self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz`
			`self.dfs = dictionary.dfs.copy()`
			`self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)`
			`if id2word is None:`
			`self.id2word = dictionary`
			`elif corpus is not None:`
			`self.initialize(corpus)`
			`else:`
			`# NOTE: everything is left uninitialized; presumably the model will`
			`# be initialized in some other way`
			`pass`

			`@classmethod`
			`def load(cls, args, *kwargs):`
			`"""Load a previously saved TfidfModel class. Handles backwards compatibility from`
			`older TfidfModel versions which did not use pivoted document normalization.`

			`"""`
			`model = super(TfidfModel, cls).load(args, *kwargs)`
			`if not hasattr(model, 'pivot'):`
			`model.pivot = None`
			`logger.info('older version of %s loaded without pivot arg', cls.__name__)`
			`logger.info('Setting pivot to %s.', model.pivot)`
			`if not hasattr(model, 'slope'):`
			`model.slope = 0.65`
			`logger.info('older version of %s loaded without slope arg', cls.__name__)`
			`logger.info('Setting slope to %s.', model.slope)`
			`return model`

			`def __str__(self):`
			`return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz)`

			`def initialize(self, corpus):`
			`"""Compute inverse document weights, which will be used to modify term frequencies for documents.`

			`Parameters`
			`----------`
			`corpus : iterable of iterable of (int, int)`
			`Input corpus.`

			`"""`
			`logger.info("collecting document frequencies")`
			`dfs = {}`
			`numnnz, docno = 0, -1`

			`for docno, bow in enumerate(corpus):`
			`if docno % 10000 == 0:`
			`logger.info("PROGRESS: processing document #%i", docno)`
			`numnnz += len(bow)`
			`for termid, _ in bow:`
			`dfs[termid] = dfs.get(termid, 0) + 1`
			`# keep some stats about the training corpus`
			`self.num_docs = docno + 1`
			`self.num_nnz = numnnz`
			`self.dfs = dfs`
			`# and finally compute the idf weights`
			`n_features = max(dfs) if dfs else 0`
			`logger.info(`
			`"calculating IDF weights for %i documents and %i features (%i matrix non-zeros)",`
			`self.num_docs, n_features, self.num_nnz`
			`)`
			`self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)`

			`def __getitem__(self, bow, eps=1e-12):`
			`"""Get the tf-idf representation of an input vector and/or corpus.`

			`bow : {list of (int, int), iterable of iterable of (int, int)}`
			Input document in the `sparse Gensim bag-of-words format
			<https://radimrehurek.com/gensim/intro.html#core-concepts>`_,
			`or a streamed corpus of such documents.`
			`eps : float`
			Threshold value, will remove all position that have tfidf-value less than `eps`.

			`Returns`
			`-------`
			`vector : list of (int, float)`
			TfIdf vector, if `bow` is a single document
			:class:`~gensim.interfaces.TransformedCorpus`
			TfIdf corpus, if `bow` is a corpus.

			`"""`
			`self.eps = eps`
			`# if the input vector is in fact a corpus, return a transformed corpus as a result`
			`is_corpus, bow = utils.is_corpus(bow)`
			`if is_corpus:`
			`return self._apply(bow)`

			`# unknown (new) terms will be given zero weight (NOT infinity/huge weight,`
			`# as strict application of the IDF formula would dictate)`

			`termid_array, tf_array = [], []`
			`for termid, tf in bow:`
			`termid_array.append(termid)`
			`tf_array.append(tf)`

			`tf_array = self.wlocal(np.array(tf_array))`

			`vector = [`
			`(termid, tf * self.idfs.get(termid))`
			`for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > self.eps`
			`]`

			`if self.normalize is True:`
			`self.normalize = matutils.unitvec`
			`elif self.normalize is False:`
			`self.normalize = utils.identity`

			`# and finally, normalize the vector either to unit length, or use a`
			`# user-defined normalization function`
			`if self.pivot is None:`
			`norm_vector = self.normalize(vector)`
			`norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps]`
			`else:`
			`_, old_norm = self.normalize(vector, return_norm=True)`
			`pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm`
			`norm_vector = [`
			`(termid, weight / float(pivoted_norm))`
			`for termid, weight in vector`
			`if abs(weight / float(pivoted_norm)) > self.eps`
			`]`
			`return norm_vector`