#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module allows simple Bag of Words (BoW) represented corpus to be transformed into log entropy space.
|
|
It implements Log Entropy Model that produces entropy-weighted logarithmic term frequency representation.
|
|
|
|
Empirical study by Lee et al. 2015 [1]_ suggests log entropy-weighted model yields better results among other forms of
|
|
representation.
|
|
|
|
References
|
|
----------
|
|
.. [1] Lee et al. 2005. An Empirical Evaluation of Models of Text Document Similarity.
|
|
https://escholarship.org/uc/item/48g155nq
|
|
|
|
"""
|
|
|
|
import logging
import math

from gensim import interfaces, matutils, utils

logger = logging.getLogger(__name__)


class LogEntropyModel(interfaces.TransformationABC):
    r"""Objects of this class realize the transformation of a word-document co-occurrence matrix (int)
    into a locally/globally weighted matrix (positive floats).

    This is done by a log entropy normalization, optionally normalizing the resulting documents to unit length.
    The following formulas explain how to compute the log entropy weight for term :math:`i` in document :math:`j`:

    .. math::

        local\_weight_{i,j} = log(frequency_{i,j} + 1)

        P_{i,j} = \frac{frequency_{i,j}}{\sum_j frequency_{i,j}}

        global\_weight_i = 1 + \frac{\sum_j P_{i,j} * log(P_{i,j})}{log(number\_of\_documents + 1)}

        final\_weight_{i,j} = local\_weight_{i,j} * global\_weight_i

    Examples
    --------
    >>> from gensim.models import LogEntropyModel
    >>> from gensim.test.utils import common_texts
    >>> from gensim.corpora import Dictionary
    >>>
    >>> dct = Dictionary(common_texts)  # fit dictionary
    >>> corpus = [dct.doc2bow(row) for row in common_texts]  # convert to BoW format
    >>> model = LogEntropyModel(corpus)  # fit model
    >>> vector = model[corpus[1]]  # apply model to document
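    >>> # the model can also wrap a whole corpus; documents are then transformed lazily on iteration
    >>> corpus_logent = model[corpus]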

    """

    def __init__(self, corpus, normalize=True):
        """

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Input corpus in BoW format.
        normalize : bool, optional
            If True, the resulting log entropy weighted vector will be normalized to unit length;
            if False, do nothing.

        """
        self.normalize = normalize
        self.n_docs = 0
        self.n_words = 0
        self.entr = {}
        if corpus is not None:
            self.initialize(corpus)

    def __str__(self):
        return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, self.n_words)

    def initialize(self, corpus):
        """Calculate the global weighting for all terms in a given corpus and transform the simple
        count representation into the log entropy normalized space.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format.

        """
        logger.info("calculating counts")
        glob_freq = {}
        glob_num_words, doc_no = 0, -1
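        # first pass over the corpus: accumulate global term frequencies and count matrix non-zeros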
        for doc_no, bow in enumerate(corpus):
            if doc_no % 10000 == 0:
                logger.info("PROGRESS: processing document #%i", doc_no)
            glob_num_words += len(bow)
            for term_id, term_count in bow:
                glob_freq[term_id] = glob_freq.get(term_id, 0) + term_count

        # keep some stats about the training corpus
        self.n_docs = doc_no + 1
        self.n_words = glob_num_words

        # and finally compute the global weights
        logger.info(
            "calculating global log entropy weights for %i documents and %i features (%i matrix non-zeros)",
            self.n_docs, len(glob_freq), self.n_words
        )
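        # second pass over the corpus: accumulate sum_j P_{i,j} * log(P_{i,j}) for every term,
        # which requires the corpus to be re-iterable (plain generators are rejected below)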
        logger.debug('iterating over corpus')
        for doc_no2, bow in enumerate(corpus):
            for key, freq in bow:
                p = (float(freq) / glob_freq[key]) * math.log(float(freq) / glob_freq[key])
                self.entr[key] = self.entr.get(key, 0.0) + p
        if doc_no2 != doc_no:
            raise ValueError("LogEntropyModel doesn't support generators as training data")

        logger.debug('iterating over keys')
        for key in self.entr:
            self.entr[key] = 1 + self.entr[key] / math.log(self.n_docs + 1)

    def __getitem__(self, bow):
        """Get log entropy representation of the input vector and/or corpus.

        Parameters
        ----------
        bow : list of (int, int)
            Document in BoW format.

        Returns
        -------
        list of (int, float)
            Log-entropy vector for passed `bow`.

        """
        # if the input vector is in fact a corpus, return a transformed corpus
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge)
        vector = [
            (term_id, math.log(tf + 1) * self.entr.get(term_id))
            for term_id, tf in bow
            if term_id in self.entr
        ]
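        # optionally scale the resulting vector to unit Euclidean (L2) length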
        if self.normalize:
            vector = matutils.unitvec(vector)
        return vector