#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2013 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

r"""This module contains functions to compute confirmation on a pair of words or word subsets.
|
||
|
||
Notes
|
||
-----
|
||
The advantage of indirect confirmation measure is that it computes similarity of words in :math:`W'` and
|
||
:math:`W^{*}` with respect to direct confirmations to all words. Eg. Suppose `x` and `z` are both competing
|
||
brands of cars, which semantically support each other. However, both brands are seldom mentioned
|
||
together in documents in the reference corpus. But their confirmations to other words like “road”
|
||
or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure.
|
||
Thus, indirect confirmation measures may capture semantic support that direct measures would miss.
|
||
|
||
The formula used to compute indirect confirmation measure is
|
||
|
||
.. math::
|
||
|
||
\widetilde{m}_{sim(m, \gamma)}(W', W^{*}) = s_{sim}(\vec{v}^{\,}_{m,\gamma}(W'), \vec{v}^{\,}_{m,\gamma}(W^{*}))
|
||
|
||
|
||
where :math:`s_{sim}` can be cosine, dice or jaccard similarity and
|
||
|
||
.. math::
|
||
|
||
\vec{v}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}
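
For example, with :math:`\gamma = 1` the context vector of a word set is simply the sum of its words'
direct confirmations to each word :math:`w_j` in :math:`W`, and the indirect measure is the similarity
(e.g. cosine) between two such vectors. A minimal sketch of this computation, using a hypothetical
precomputed matrix ``m`` of direct confirmations (not part of this module)::

    import numpy as np

    gamma = 1.0
    # hypothetical direct confirmations m(w_i, w_j) for a topic W of three words
    m = np.array([[1.0, 0.2, 0.5],
                  [0.2, 1.0, 0.4],
                  [0.5, 0.4, 1.0]])

    def context_vector(word_indices, m, gamma):
        # v_j = sum over w_i in the word set of m(w_i, w_j) ** gamma, for j = 1, ..., |W|
        return (m[word_indices] ** gamma).sum(axis=0)

    v_prime = context_vector([0], m, gamma)    # W' = {w_1}
    v_star = context_vector([1, 2], m, gamma)  # W* = {w_2, w_3}
    cosine = v_prime.dot(v_star) / (np.linalg.norm(v_prime) * np.linalg.norm(v_star))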
"""

import itertools
import logging

import numpy as np
import scipy.sparse as sps

from gensim.topic_coherence.direct_confirmation_measure import aggregate_segment_sims, log_ratio_measure

logger = logging.getLogger(__name__)


def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_support=False):
    """For each topic segmentation, compute average cosine similarity using a
    :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator`.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from :func:`~gensim.topic_coherence.segmentation.s_one_set`.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or
        :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets
        in addition to the mean coherence for each topic.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as
        the number of pairwise similarity comparisons that were used to compute the overall topic coherence.

    Returns
    -------
    list of (float[, float[, int]])
        Cosine word2vec similarities per topic (with std/support if `with_std`/`with_support` is set).

    Examples
    --------
    >>> import numpy as np
    >>> from gensim.corpora.dictionary import Dictionary
    >>> from gensim.topic_coherence import indirect_confirmation_measure
    >>> from gensim.topic_coherence import text_analysis
    >>>
    >>> # create segmentation
    >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
    >>>
    >>> # create accumulator
    >>> dictionary = Dictionary()
    >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
    >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
    >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
    >>>
    >>> # should be (0.726752426218, 0.00695475919227)
    >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0]

    """
    topic_coherences = []
    total_oov = 0

    for topic_index, topic_segments in enumerate(segmented_topics):
        segment_sims = []
        num_oov = 0
        for w_prime, w_star in topic_segments:
            # segments may pair a single word id with an array of ids; normalize both sides to iterables
            if not hasattr(w_prime, '__iter__'):
                w_prime = [w_prime]
            if not hasattr(w_star, '__iter__'):
                w_star = [w_star]

            try:
                segment_sims.append(accumulator.ids_similarity(w_prime, w_star))
            except ZeroDivisionError:
                # the segment's words are missing from the word2vec vocabulary
                num_oov += 1

        if num_oov > 0:
            total_oov += 1
            logger.warning(
                "%d terms for topic %d are not in word2vec model vocabulary",
                num_oov, topic_index)
        topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))

    if total_oov > 0:
        logger.warning("%d topics have terms that are not in word2vec model vocabulary", total_oov)
    return topic_coherences


def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr',
                      gamma=1, with_std=False, with_support=False):
    """Calculate the indirect cosine measure.

    Parameters
    ----------
    segmented_topics : list of lists of (int, `numpy.ndarray`)
        Output from the segmentation module of the segmented topics.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Output from the probability_estimation module.
    topics : list of `numpy.ndarray`
        Topics obtained from the trained topic model.
    measure : str, optional
        Direct confirmation measure to be used. The only supported value is "nlr" (normalized log ratio).
    gamma : float, optional
        Gamma value for computing the :math:`W'` and :math:`W^{*}` vectors.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets in addition to the mean coherence
        for each topic; default is False.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as the number of pairwise
        similarity comparisons that were used to compute the overall topic coherence.

    Returns
    -------
    list
        List of indirect cosine similarity measures, one for each topic.

    Examples
    --------
    >>> from gensim.corpora.dictionary import Dictionary
    >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
    >>> import numpy as np
    >>>
    >>> # create accumulator
    >>> dictionary = Dictionary()
    >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
    >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
    >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
    >>> accumulator._num_docs = 5
    >>>
    >>> # create topics
    >>> topics = [np.array([1, 2])]
    >>>
    >>> # create segmentation
    >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]
    >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, 'nlr', 1)
    >>> print(obtained[0])
    0.623018926945

    """
    context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma)

    topic_coherences = []
    for topic_words, topic_segments in zip(topics, segmented_topics):
        topic_words = tuple(topic_words)  # because tuples are hashable
        segment_sims = np.zeros(len(topic_segments))
        for i, (w_prime, w_star) in enumerate(topic_segments):
            w_prime_cv = context_vectors[w_prime, topic_words]
            w_star_cv = context_vectors[w_star, topic_words]
            segment_sims[i] = _cossim(w_prime_cv, w_star_cv)

        topic_coherences.append(aggregate_segment_sims(segment_sims, with_std, with_support))

    return topic_coherences


class ContextVectorComputer(object):
    """Lazily compute context vectors for topic segments.

    Parameters
    ----------
    measure : str
        Confirmation measure.
    topics : list of `numpy.ndarray`
        Topics.
    accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or
        :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator from probability_estimation.
    gamma : float
        Gamma value for computing the :math:`W'` and :math:`W^{*}` vectors.

    Attributes
    ----------
    sim_cache : dict
        Cache of similarities between tokens, keyed by pairs of word ids, e.g. (1, 2).
    context_vector_cache : dict
        Mapping from (segment, topic_words) --> context_vector.

    Example
    -------
    >>> from gensim.corpora.dictionary import Dictionary
    >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis
    >>> import numpy as np
    >>>
    >>> # create measure, topics
    >>> measure = 'nlr'
    >>> topics = [np.array([1, 2])]
    >>>
    >>> # create accumulator
    >>> dictionary = Dictionary()
    >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
    >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
    >>> _ = accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
    >>> cont_vect_comp = indirect_confirmation_measure.ContextVectorComputer(measure, topics, accumulator, 1)
    >>> cont_vect_comp.mapping
    {1: 0, 2: 1}
    >>> cont_vect_comp.vocab_size
    2
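
    Context vectors are then looked up through ``__getitem__``. A minimal sketch (not executed here),
    assuming an accumulator that supports joint occurrence counts, such as the toy
    :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` built in the
    :func:`cosine_similarity` example above::

        # context vector of segment {w_1} against topic words (w_1, w_2):
        # a sparse |W| x 1 matrix whose j-th entry is sum_i m(w_i, w_j) ** gamma
        cv = cont_vect_comp[(1,), (1, 2)]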

    """

    def __init__(self, measure, topics, accumulator, gamma):
        if measure == 'nlr':
            self.similarity = _pair_npmi
        else:
            raise ValueError(
                "The direct confirmation measure you entered is not currently supported.")

        self.mapping = _map_to_contiguous(topics)
        self.vocab_size = len(self.mapping)
        self.accumulator = accumulator
        self.gamma = gamma
        self.sim_cache = {}
        self.context_vector_cache = {}

    def __getitem__(self, idx):
        return self.compute_context_vector(*idx)

    def compute_context_vector(self, segment_word_ids, topic_word_ids):
        """Return the context vector for (segment_word_ids, topic_word_ids),
        computing and caching it if it has not been computed before.

        Parameters
        ----------
        segment_word_ids : list
            Ids of words in segment.
        topic_word_ids : list
            Ids of words in topic.

        Returns
        -------
        :class:`scipy.sparse.csr_matrix`
            If the context vector has been cached, the cached vector;
            else it is computed, cached, and returned.

        """
        key = _key_for_segment(segment_word_ids, topic_word_ids)
        context_vector = self.context_vector_cache.get(key, None)
        if context_vector is None:
            context_vector = self._make_seg(segment_word_ids, topic_word_ids)
            self.context_vector_cache[key] = context_vector
        return context_vector

    def _make_seg(self, segment_word_ids, topic_word_ids):
        """Return context vector for segmentation (internal helper function).

        Parameters
        ----------
        segment_word_ids : iterable or int
            Ids of words in segment.
        topic_word_ids : list
            Ids of words in topic.

        Returns
        -------
        :class:`scipy.sparse.csr_matrix`
            Context vector in Compressed Sparse Row format.

        """
        context_vector = sps.lil_matrix((self.vocab_size, 1))
        if not hasattr(segment_word_ids, '__iter__'):
            segment_word_ids = (segment_word_ids,)

        for w_j in topic_word_ids:
            idx = (self.mapping[w_j], 0)
            for pair in (tuple(sorted((w_i, w_j))) for w_i in segment_word_ids):
                if pair not in self.sim_cache:
                    self.sim_cache[pair] = self.similarity(pair, self.accumulator)

                context_vector[idx] += self.sim_cache[pair] ** self.gamma

        return context_vector.tocsr()


def _pair_npmi(pair, accumulator):
    """Compute normalized pointwise mutual information (**NPMI**) between a pair of words.

    Parameters
    ----------
    pair : (int, int)
        The pair of words (word_id1, word_id2).
    accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator`
        Word occurrence accumulator from probability_estimation.

    Returns
    -------
    float
        NPMI between the pair of words.
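
    Example
    -------
    A minimal sketch (not executed here), reusing the toy
    :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` built in the
    :func:`cosine_similarity` example above::

        npmi = _pair_npmi((1, 2), accumulator)  # NPMI of the pair ('fake', 'tokens')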

    """
    return log_ratio_measure([[pair]], accumulator, True)[0]


def _cossim(cv1, cv2):
    # cosine similarity between two sparse column context vectors
    return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))


def _magnitude(sparse_vec):
    # Euclidean (L2) norm of a sparse vector
    return np.sqrt(np.sum(sparse_vec.data ** 2))


def _map_to_contiguous(ids_iterable):
    # map each unique word id, in order of first appearance, to a contiguous index 0, 1, 2, ...
    uniq_ids = {}
    n = 0
    for id_ in itertools.chain.from_iterable(ids_iterable):
        if id_ not in uniq_ids:
            uniq_ids[id_] = n
            n += 1
    return uniq_ids


def _key_for_segment(segment, topic_words):
    """A segment may hold a single number or an iterable of them."""
    segment_key = tuple(segment) if hasattr(segment, '__iter__') else segment
    return segment_key, topic_words