laywerrobot/lib/python3.6/site-packages/gensim/summarization/mz_entropy.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
from gensim.summarization.textcleaner import tokenize_by_word as _tokenize_by_word
from gensim.utils import to_unicode
import numpy
import scipy.special  # imported explicitly: plain `import scipy` does not reliably expose scipy.special


def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0):
    """Extract keywords from text using the Montemurro and Zanette entropy algorithm. [1]_

    Parameters
    ----------
    text: str
        Document for summarization.
    blocksize: int, optional
        Size of blocks to use in analysis.
    scores: bool, optional
        Whether to return scores with keywords.
    split: bool, optional
        Whether to return results as a list.
    weighted: bool, optional
        Whether to weight scores by word frequency.
        False can be useful for shorter texts, and allows automatic thresholding.
    threshold: float or 'auto', optional
        Minimum score for returned keywords, 'auto' calculates the threshold as
        n_blocks / (n_blocks + 1.0) + 1e-8. Use 'auto' with `weighted=False`.

    Returns
    -------
    results: str
        newline-separated keywords if `split` == False **OR**
    results: list(str)
        list of keywords if `scores` == False **OR**
    results: list(tuple(str, float))
        list of (keyword, score) tuples if `scores` == True

    Results are returned in descending order of score regardless of the format.

    Note
    ----
    This algorithm looks for keywords that contribute to the structure of the
    text on scales of `blocksize` words or larger. It is suitable for extracting
    keywords representing the major themes of long texts.

    References
    ----------
    .. [1] Marcello A. Montemurro, Damian Zanette, "Towards the quantification of the semantic information encoded in
           written language". Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153,
           DOI: 10.1142/S0219525910002530, https://arxiv.org/abs/0907.1558

    """
    text = to_unicode(text)
    words = [word for word in _tokenize_by_word(text)]
    vocab = sorted(set(words))
    # Count occurrences of each vocabulary word in each consecutive block.
    word_counts = numpy.array(
        [
            [words[i:i + blocksize].count(word) for word in vocab]
            for i in range(0, len(words), blocksize)
        ]
    ).astype('d')
    n_blocks = word_counts.shape[0]
    totals = word_counts.sum(axis=0)
    n_words = totals.sum()
    # Observed (negative) entropy of each word's distribution over blocks,
    # plus the analytically expected entropy for a word of the same frequency.
    p = word_counts / totals
    log_p = numpy.log2(p)
    h = numpy.nan_to_num(p * log_p).sum(axis=0)
    analytic = __analytic_entropy(blocksize, n_blocks, n_words)
    h += analytic(totals).astype('d')
    if weighted:
        h *= totals / n_words
    if threshold == 'auto':
        threshold = n_blocks / (n_blocks + 1.0) + 1.0e-8
    weights = [(word, score) for (word, score) in zip(vocab, h) if score > threshold]
    weights.sort(key=lambda x: -x[1])
    result = weights if scores else [word for (word, score) in weights]
    if not (scores or split):
        result = '\n'.join(result)
    return result
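
# Usage sketch (added; not from the original module). The file path is
# hypothetical; any sufficiently long unicode document works:
#
#     from gensim.summarization.mz_entropy import mz_keywords
#
#     with open('document.txt', encoding='utf8') as fh:
#         raw = fh.read()
#
#     # Frequency-weighted scores, returned as a newline-separated string:
#     print(mz_keywords(raw))
#
#     # Unweighted (keyword, score) pairs with automatic thresholding:
#     pairs = mz_keywords(raw, scores=True, weighted=False, threshold='auto')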


def __log_combinations_inner(n, m):
    """Calculate the logarithm of n!/(m!(n-m)!) via the log-beta function."""
    return -(numpy.log(n + 1) + scipy.special.betaln(n - m + 1, m + 1))


__log_combinations = numpy.frompyfunc(__log_combinations_inner, 2, 1)
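
# Illustrative sanity check (added; not part of the original module):
# log C(5, 2) = log(5! / (2! * 3!)) = log(10) ~= 2.302585, so
# float(__log_combinations(5, 2)) should evaluate to roughly 2.302585.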


def __marginal_prob(blocksize, n_words):

    def marginal_prob(n, m):
        """Marginal probability of a word that occurs n times in the document
        occurring m times in a given block."""
        return numpy.exp(
            __log_combinations(n, m) +
            __log_combinations(n_words - n, blocksize - m) -
            __log_combinations(n_words, blocksize)
        )

    return numpy.frompyfunc(marginal_prob, 2, 1)
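
# Note (added): marginal_prob(n, m) is the hypergeometric probability of
# drawing the target word m times when a block of `blocksize` tokens is
# sampled without replacement from a document of `n_words` tokens that
# contains the word n times. An equivalent cross-check with scipy.stats
# (illustrative; scipy.stats is not imported by this module):
#     scipy.stats.hypergeom.pmf(m, n_words, n, blocksize)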


def __analytic_entropy(blocksize, n_blocks, n_words):
    marginal = __marginal_prob(blocksize, n_words)

    def analytic_entropy(n):
        """Predicted entropy for a word that occurs n times in the document."""
        m = numpy.arange(1, min(blocksize, n) + 1).astype('d')
        p = m / n
        elements = numpy.nan_to_num(p * numpy.log2(p)) * marginal(n, m)
        return -n_blocks * elements.sum()

    return numpy.frompyfunc(analytic_entropy, 1, 1)
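

if __name__ == '__main__':
    # Minimal runnable demo (added; not part of the original gensim module).
    # The synthetic text below is a stand-in for a real long document: 'qux'
    # is clustered at the end, so it should surface with a high score.
    demo_text = ' '.join(['foo bar baz'] * 1000 + ['qux'] * 50)
    print(mz_keywords(demo_text, blocksize=128, scores=True, weighted=False, threshold='auto'))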