laywerrobot/lib/python3.6/site-packages/gensim/models/normmodel.py

100 lines
2.6 KiB
Python
Raw Normal View History

2020-08-27 21:55:39 +02:00
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
import logging
from gensim import interfaces, matutils
logger = logging.getLogger(__name__)
class NormModel(interfaces.TransformationABC):
"""Objects of this class realize the explicit normalization of vectors (l1 and l2)."""
def __init__(self, corpus=None, norm='l2'):
"""Compute the l1 or l2 normalization by normalizing separately for each document in a corpus.
If :math:`v_{i,j}` is the 'i'th component of the vector representing document 'j', the l1 normalization is
.. math:: l1_{i, j} = \\frac{v_{i,j}}{\sum_k |v_{k,j}|}
the l2 normalization is
.. math:: l2_{i, j} = \\frac{v_{i,j}}{\sqrt{\sum_k v_{k,j}^2}}
Parameters
----------
corpus : iterable of iterable of (int, number), optional
Input corpus.
norm : {'l1', 'l2'}, optional
Norm used to normalize.
"""
self.norm = norm
if corpus is not None:
self.calc_norm(corpus)
else:
pass
def __str__(self):
return "NormModel(num_docs=%s, num_nnz=%s, norm=%s)" % (self.num_docs, self.num_nnz, self.norm)
def calc_norm(self, corpus):
"""Calculate the norm by calling :func:`~gensim.matutils.unitvec` with the norm parameter.
Parameters
----------
corpus : iterable of iterable of (int, number)
Input corpus.
"""
logger.info("Performing %s normalization...", self.norm)
norms = []
numnnz = 0
docno = 0
for bow in corpus:
docno += 1
numnnz += len(bow)
norms.append(matutils.unitvec(bow, self.norm))
self.num_docs = docno
self.num_nnz = numnnz
self.norms = norms
def normalize(self, bow):
"""Normalize a simple count representation.
Parameters
----------
bow : list of (int, number)
Document in BoW format.
Returns
-------
list of (int, number)
Normalized document.
"""
vector = matutils.unitvec(bow, self.norm)
return vector
def __getitem__(self, bow):
"""Call the :func:`~gensim.models.normmodel.NormModel.normalize`.
Parameters
----------
bow : list of (int, number)
Document in BoW format.
Returns
-------
list of (int, number)
Normalized document.
"""
return self.normalize(bow)