#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
r"""This module contains functions to compute confirmation on a pair of words or word subsets.
Notes
-----
The advantage of indirect confirmation measure is that it computes similarity of words in :math:`W'` and
:math:`W^{*}` with respect to direct confirmations to all words. Eg. Suppose `x` and `z` are both competing
brands of cars, which semantically support each other. However, both brands are seldom mentioned
together in documents in the reference corpus. But their confirmations to other words like road
or speed do strongly correlate. This would be reflected by an indirect confirmation measure.
Thus, indirect confirmation measures may capture semantic support that direct measures would miss.
The formula used to compute indirect confirmation measure is
.. math::
\widetilde{m}_{sim(m, \gamma)}(W', W^{*}) = s_{sim}(\vec{v}^{\,}_{m,\gamma}(W'), \vec{v}^{\,}_{m,\gamma}(W^{*}))
where :math:`s_{sim}` can be cosine, dice or jaccard similarity and
.. math::
\vec{v}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}
"""
import itertools
import logging

import numpy as np
import scipy.sparse as sps

from gensim.topic_coherence.direct_confirmation_measure import aggregate_segment_sims, log_ratio_measure

logger = logging.getLogger(__name__)


def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_support=False):
    """For each topic segmentation, compute average cosine similarity using a
    :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator`.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from :func:`~gensim.topic_coherence.segmentation.s_one_set`.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or
        :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets
        in addition to the mean coherence for each topic.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as
        the number of pairwise similarity comparisons that were used to compute the overall topic coherence.

    Returns
    -------
    list of (float[, float[, int]])
        Cosine word2vec similarities per topic (with std/support if `with_std`, `with_support`).

    Examples
    --------
    >>> import numpy as np
    >>> from gensim.corpora.dictionary import Dictionary
    >>> from gensim.topic_coherence import indirect_confirmation_measure
    >>> from gensim.topic_coherence import text_analysis
    >>>
    >>> # create segmentation
    >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
    >>>
    >>> # create accumulator
    >>> dictionary = Dictionary()
    >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
    >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
    >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
    >>>
    >>> # should be (0.726752426218, 0.00695475919227)
    >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0]

    """
topic_coherences = []
total_oov = 0
for topic_index, topic_segments in enumerate(segmented_topics):
segment_sims = []
num_oov = 0
for w_prime, w_star in topic_segments:
if not hasattr(w_prime, '__iter__'):
w_prime = [w_prime]
if not hasattr(w_star, '__iter__'):
w_star = [w_star]
            try:
                segment_sims.append(accumulator.ids_similarity(w_prime, w_star))
            except ZeroDivisionError:
                # Raised when the segment has no in-vocabulary terms to compare.
                num_oov += 1
if num_oov > 0:
total_oov += 1
logger.warning(
"%d terms for topic %d are not in word2vec model vocabulary",
num_oov, topic_index)
topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))

    if total_oov > 0:
        logger.warning("%d topics contain terms not in the word2vec model vocabulary", total_oov)

    return topic_coherences


def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr',
                      gamma=1, with_std=False, with_support=False):
    """Calculate the indirect cosine measure.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from the segmentation module of the segmented topics.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Output from the probability_estimation module.
    topics : list of `numpy.ndarray`
        Topics obtained from the trained topic model.
    measure : str, optional
        Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio).
    gamma : float, optional
        Gamma value for computing the :math:`W'` and :math:`W^{*}` vectors.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets in addition to the mean coherence
        for each topic; default is False.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as the number of pairwise
        similarity comparisons that were used to compute the overall topic coherence.

    Returns
    -------
    list
        List of indirect cosine similarity measures, one for each topic.

    Examples
    --------
    >>> from gensim.corpora.dictionary import Dictionary
    >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
    >>> import numpy as np
    >>>
    >>> # create accumulator
    >>> dictionary = Dictionary()
    >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
    >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
    >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
    >>> accumulator._num_docs = 5
    >>>
    >>> # create topics
    >>> topics = [np.array([1, 2])]
    >>>
    >>> # create segmentation
    >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
    >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, 'nlr', 1)
    >>> print(obtained[0])
    0.623018926945

    """
context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma)
topic_coherences = []
for topic_words, topic_segments in zip(topics, segmented_topics):
topic_words = tuple(topic_words) # because tuples are hashable
segment_sims = np.zeros(len(topic_segments))
for i, (w_prime, w_star) in enumerate(topic_segments):
w_prime_cv = context_vectors[w_prime, topic_words]
w_star_cv = context_vectors[w_star, topic_words]
segment_sims[i] = _cossim(w_prime_cv, w_star_cv)
topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))
return topic_coherences


class ContextVectorComputer(object):
    """Lazily compute context vectors for topic segments.

    Parameters
    ----------
    measure : str
        Confirmation measure.
    topics : list of `numpy.ndarray`
        Topics.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or
        :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator from probability_estimation.
    gamma : float
        Value for computing vectors.

    Attributes
    ----------
    sim_cache : dict
        Cache of similarities between tokens (pairs of word ids), e.g. (1, 2).
    context_vector_cache : dict
        Mapping from (segment, topic_words) --> context_vector.

    Examples
    --------
    >>> from gensim.corpora.dictionary import Dictionary
    >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
    >>> import numpy as np
    >>>
    >>> # create measure, topics
    >>> measure = 'nlr'
    >>> topics = [np.array([1, 2])]
    >>>
    >>> # create accumulator
    >>> dictionary = Dictionary()
    >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
    >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
    >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
    >>> cont_vect_comp = indirect_confirmation_measure.ContextVectorComputer(measure, topics, accumulator, 1)
    >>> cont_vect_comp.mapping
    {1: 0, 2: 1}
    >>> cont_vect_comp.vocab_size
    2

    """

    def __init__(self, measure, topics, accumulator, gamma):
        if measure == 'nlr':
            self.similarity = _pair_npmi
        else:
            raise ValueError(
                "The direct confirmation measure %r is not currently supported." % measure)

        self.mapping = _map_to_contiguous(topics)
        self.vocab_size = len(self.mapping)
        self.accumulator = accumulator
        self.gamma = gamma
        self.sim_cache = {}
        self.context_vector_cache = {}

    def __getitem__(self, idx):
        return self.compute_context_vector(*idx)
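
    # Indexing sugar: `computer[segment_ids, topic_ids]` is equivalent to
    # `computer.compute_context_vector(segment_ids, topic_ids)`; e.g.
    # `cont_vect_comp[1, (1, 2)]` (with the objects from the class docstring
    # example) returns the cached or freshly computed context vector of word
    # id 1 with respect to topic words (1, 2).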

    def compute_context_vector(self, segment_word_ids, topic_word_ids):
        """Retrieve the context vector for (segment_word_ids, topic_word_ids), computing
        and caching it on first use.

        Parameters
        ----------
        segment_word_ids : list
            Ids of words in segment.
        topic_word_ids : list
            Ids of words in topic.

        Returns
        -------
        :class:`scipy.sparse.csr_matrix`
            If the context vector has been cached, the cached vector is returned;
            otherwise it is computed, cached, and returned.

        """
        key = _key_for_segment(segment_word_ids, topic_word_ids)
        context_vector = self.context_vector_cache.get(key, None)
        if context_vector is None:
            context_vector = self._make_seg(segment_word_ids, topic_word_ids)
            self.context_vector_cache[key] = context_vector
        return context_vector

    def _make_seg(self, segment_word_ids, topic_word_ids):
        """Build the context vector for a segmentation (internal helper function).

        Parameters
        ----------
        segment_word_ids : iterable of int or int
            Ids of words in segment.
        topic_word_ids : list
            Ids of words in topic.

        Returns
        -------
        :class:`scipy.sparse.csr_matrix`
            Context vector in Compressed Sparse Row format.

        """
        # Accumulate in LIL format, which supports cheap per-element writes, then
        # convert to CSR at the end for the fast dot products used in `_cossim`.
        context_vector = sps.lil_matrix((self.vocab_size, 1))
        if not hasattr(segment_word_ids, '__iter__'):
            segment_word_ids = (segment_word_ids,)

        for w_j in topic_word_ids:
            idx = (self.mapping[w_j], 0)
            # Sort each pair so (i, j) and (j, i) share one cache entry.
            for pair in (tuple(sorted((w_i, w_j))) for w_i in segment_word_ids):
                if pair not in self.sim_cache:
                    self.sim_cache[pair] = self.similarity(pair, self.accumulator)
                context_vector[idx] += self.sim_cache[pair] ** self.gamma

        return context_vector.tocsr()


def _pair_npmi(pair, accumulator):
    """Compute normalized pointwise mutual information (**NPMI**) between a pair of words.

    Parameters
    ----------
    pair : (int, int)
        The pair of words (word_id1, word_id2).
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator from probability_estimation.

    Returns
    -------
    float
        NPMI between the pair of words.

    """
    return log_ratio_measure([[pair]], accumulator, True)[0]
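
# For reference (a restatement of the call above, not new behavior): for a word
# pair (i, j) with probabilities estimated from the accumulator, the value is
#
#     npmi(i, j) = log(p(i, j) / (p(i) * p(j))) / -log(p(i, j))
#
# i.e. log_ratio_measure with normalize=True, applied to the single segment [[pair]].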


def _cossim(cv1, cv2):
    # Cosine similarity between two sparse column vectors.
    return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


def _magnitude(sparse_vec):
    # Euclidean (L2) norm of a sparse vector.
    return np.sqrt(np.sum(sparse_vec.data ** 2))
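
# For illustration (toy vectors): for sparse column vectors [1, 0] and [1, 1],
# _cossim returns 1.0 / (1.0 * sqrt(2)) ~= 0.7071.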


def _map_to_contiguous(ids_iterable):
    # Map each unique id to a contiguous index 0..n-1, in order of first appearance.
    uniq_ids = {}
    n = 0
    for id_ in itertools.chain.from_iterable(ids_iterable):
        if id_ not in uniq_ids:
            uniq_ids[id_] = n
            n += 1
    return uniq_ids
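
# For illustration (hypothetical ids): _map_to_contiguous([np.array([10, 3]), np.array([3, 7])])
# returns {10: 0, 3: 1, 7: 2} -- ids are numbered in order of first appearance.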


def _key_for_segment(segment, topic_words):
    """A segment may be a single number or an iterable of them."""
    segment_key = tuple(segment) if hasattr(segment, '__iter__') else segment
    return segment_key, topic_words