laywerrobot/lib/python3.6/site-packages/gensim/topic_coherence/direct_confirmation_measure.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""This module contains functions to compute direct confirmation on a pair of words or word subsets."""
import logging
import numpy as np
logger = logging.getLogger(__name__)
# Should be small. Value as suggested in paper http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf
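# It is added to the joint probabilities in the measures below so that np.log never receives zero.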
EPSILON = 1e-12


def log_conditional_probability(segmented_topics, accumulator, with_std=False, with_support=False):
"""Calculate the log-conditional-probability measure which is used by coherence measures such as `U_mass`.
This is defined as :math:`m_{lc}(S_i) = log \\frac{P(W', W^{*}) + \epsilon}{P(W^{*})}`.
Parameters
----------
segmented_topics : list of lists of (int, int)
Output from the :func:`~gensim.topic_coherence.segmentation.s_one_pre`,
:func:`~gensim.topic_coherence.segmentation.s_one_one`.
accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
Word occurrence accumulator from :mod:`gensim.topic_coherence.probability_estimation`.
with_std : bool, optional
True to also include standard deviation across topic segment sets in addition to the mean coherence
for each topic.
with_support : bool, optional
True to also include support across topic segments. The support is defined as the number of pairwise
similarity comparisons were used to compute the overall topic coherence.
Returns
-------
list of float
Log conditional probabilities measurement for each topic.
Examples
--------
>>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis
>>> from collections import namedtuple
>>>
>>> # Create dictionary
>>> id2token = {1: 'test', 2: 'doc'}
>>> token2id = {v: k for k, v in id2token.items()}
>>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token)
>>>
>>> # Initialize segmented topics and accumulator
>>> segmentation = [[(1, 2)]]
>>>
>>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
>>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
>>> accumulator._num_docs = 5
>>>
>>> # result should be ~ ln(1 / 2) = -0.693147181
>>> result = direct_confirmation_measure.log_conditional_probability(segmentation, accumulator)[0]
"""
topic_coherences = []
num_docs = float(accumulator.num_docs)
for s_i in segmented_topics:
segment_sims = []
for w_prime, w_star in s_i:
try:
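                # m_lc is the log of the smoothed conditional probability
                # P(W' | W*) = P(W', W*) / P(W*); EPSILON keeps the argument
                # of the log nonzero when the pair never co-occurs.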
w_star_count = accumulator[w_star]
co_occur_count = accumulator[w_prime, w_star]
m_lc_i = np.log(((co_occur_count / num_docs) + EPSILON) / (w_star_count / num_docs))
except KeyError:
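                # The word id or pair was never observed by the accumulator,
                # so there is no evidence for the pair; score it as zero.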
m_lc_i = 0.0
segment_sims.append(m_lc_i)
topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))
    return topic_coherences


def aggregate_segment_sims(segment_sims, with_std, with_support):
"""Compute various statistics from the segment similarities generated via set pairwise comparisons
of top-N word lists for a single topic.
Parameters
----------
segment_sims : iterable of float
Similarity values to aggregate.
with_std : bool
Set to True to include standard deviation.
with_support : bool
Set to True to include number of elements in `segment_sims` as a statistic in the results returned.
Returns
-------
(float[, float[, int]])
Tuple with (mean[, std[, support]]).
Examples
---------
>>> from gensim.topic_coherence import direct_confirmation_measure
>>>
>>> segment_sims = [0.2, 0.5, 1., 0.05]
>>> direct_confirmation_measure.aggregate_segment_sims(segment_sims, True, True)
(0.4375, 0.36293077852394939, 4)
>>> direct_confirmation_measure.aggregate_segment_sims(segment_sims, False, False)
0.4375
"""
mean = np.mean(segment_sims)
stats = [mean]
if with_std:
stats.append(np.std(segment_sims))
if with_support:
stats.append(len(segment_sims))
    return stats[0] if len(stats) == 1 else tuple(stats)


def log_ratio_measure(segmented_topics, accumulator, normalize=False, with_std=False, with_support=False):
"""Compute log ratio measure for `segment_topics`.
Parameters
----------
segmented_topics : list of lists of (int, int)
Output from the :func:`~gensim.topic_coherence.segmentation.s_one_pre`,
:func:`~gensim.topic_coherence.segmentation.s_one_one`.
accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
Word occurrence accumulator from :mod:`gensim.topic_coherence.probability_estimation`.
normalize : bool, optional
Details in the "Notes" section.
with_std : bool, optional
True to also include standard deviation across topic segment sets in addition to the mean coherence
for each topic.
with_support : bool, optional
True to also include support across topic segments. The support is defined as the number of pairwise
similarity comparisons were used to compute the overall topic coherence.
Notes
-----
If `normalize=False`:
Calculate the log-ratio-measure, popularly known as **PMI** which is used by coherence measures such as `c_v`.
This is defined as :math:`m_{lr}(S_i) = log \\frac{P(W', W^{*}) + \epsilon}{P(W') * P(W^{*})}`
If `normalize=True`:
Calculate the normalized-log-ratio-measure, popularly knowns as **NPMI**
which is used by coherence measures such as `c_v`.
This is defined as :math:`m_{nlr}(S_i) = \\frac{m_{lr}(S_i)}{-log(P(W', W^{*}) + \epsilon)}`
Returns
-------
list of float
Log ratio measurements for each topic.
Examples
--------
>>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis
>>> from collections import namedtuple
>>>
>>> # Create dictionary
>>> id2token = {1: 'test', 2: 'doc'}
>>> token2id = {v: k for k, v in id2token.items()}
>>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token)
>>>
>>> # Initialize segmented topics and accumulator
>>> segmentation = [[(1, 2)]]
>>>
>>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
>>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
>>> accumulator._num_docs = 5
>>>
>>> # result should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557
>>> result = direct_confirmation_measure.log_ratio_measure(segmentation, accumulator)[0]
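    >>> # normalized result should be ~ ln(5 / 6) / -ln(1 / 5) = -0.113283
    >>> npmi_result = direct_confirmation_measure.log_ratio_measure(segmentation, accumulator, normalize=True)[0]
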
"""
topic_coherences = []
num_docs = float(accumulator.num_docs)
for s_i in segmented_topics:
segment_sims = []
for w_prime, w_star in s_i:
w_prime_count = accumulator[w_prime]
w_star_count = accumulator[w_star]
co_occur_count = accumulator[w_prime, w_star]
if normalize:
# For normalized log ratio measure
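                # NPMI: divide the PMI of the pair (obtained via a recursive
                # call on this single pair) by -log of the smoothed joint
                # probability, which bounds the measure to [-1, 1].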
numerator = log_ratio_measure([[(w_prime, w_star)]], accumulator)[0]
co_doc_prob = co_occur_count / num_docs
m_lr_i = numerator / (-np.log(co_doc_prob + EPSILON))
else:
# For log ratio measure without normalization
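                # PMI: log of the smoothed joint probability over the product
                # of the marginal probabilities.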
numerator = (co_occur_count / num_docs) + EPSILON
denominator = (w_prime_count / num_docs) * (w_star_count / num_docs)
m_lr_i = np.log(numerator / denominator)
segment_sims.append(m_lr_i)
topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))
return topic_coherences
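

if __name__ == '__main__':
    # Minimal smoke test mirroring the docstring examples above. This is an
    # illustrative sketch only: like the doctests, it seeds the private
    # _inverted_index/_num_docs attributes by hand instead of building the
    # accumulator from a real corpus.
    from collections import namedtuple

    from gensim.topic_coherence import text_analysis

    id2token = {1: 'test', 2: 'doc'}
    token2id = {v: k for k, v in id2token.items()}
    dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token)

    accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
    accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
    accumulator._num_docs = 5

    segmentation = [[(1, 2)]]
    print(log_conditional_probability(segmentation, accumulator))  # ~ [-0.693147181]
    print(log_ratio_measure(segmentation, accumulator))  # ~ [-0.182321557]
    print(log_ratio_measure(segmentation, accumulator, normalize=True))  # ~ [-0.113283]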