laywerrobot/lib/python3.6/site-packages/gensim/models/coherencemodel.py
2020-08-27 21:55:39 +02:00

696 lines
25 KiB
Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Calculate topic coherence for topic models. This is the implementation of the four stage topic coherence pipeline
from the paper `Michael Roeder, Andreas Both and Alexander Hinneburg: "Exploring the space of topic coherence measures"
<http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf>`_.
Typically, :class:`~gensim.models.coherencemodel.CoherenceModel` is used for evaluation of topic models.
The four stage pipeline is basically:
* Segmentation
* Probability Estimation
* Confirmation Measure
* Aggregation
This implementation lets the user, in essence, "make" a coherence measure of their choice
by choosing a method for each stage of the pipeline.
See Also
--------
:mod:`gensim.topic_coherence`
Internal functions for pipelines.
"""
import logging
import multiprocessing as mp
from collections import namedtuple
import numpy as np
from gensim import interfaces, matutils
from gensim import utils
from gensim.topic_coherence import (segmentation, probability_estimation,
direct_confirmation_measure, indirect_confirmation_measure,
aggregation)
from gensim.topic_coherence.probability_estimation import unique_ids_from_segments
logger = logging.getLogger(__name__)
# Names of coherence measures, grouped by the kind of probability estimation they are checked against.
BOOLEAN_DOCUMENT_BASED = {'u_mass'}
SLIDING_WINDOW_BASED = {'c_v', 'c_uci', 'c_npmi', 'c_w2v'}

# One callable per pipeline stage: segmentation, probability estimation, confirmation, aggregation.
_make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr')

# Maps a coherence measure name to the four-stage pipeline that implements it.
COHERENCE_MEASURES = {
    'u_mass': _make_pipeline(
        seg=segmentation.s_one_pre,
        prob=probability_estimation.p_boolean_document,
        conf=direct_confirmation_measure.log_conditional_probability,
        aggr=aggregation.arithmetic_mean,
    ),
    'c_v': _make_pipeline(
        seg=segmentation.s_one_set,
        prob=probability_estimation.p_boolean_sliding_window,
        conf=indirect_confirmation_measure.cosine_similarity,
        aggr=aggregation.arithmetic_mean,
    ),
    'c_w2v': _make_pipeline(
        seg=segmentation.s_one_set,
        prob=probability_estimation.p_word2vec,
        conf=indirect_confirmation_measure.word2vec_similarity,
        aggr=aggregation.arithmetic_mean,
    ),
    'c_uci': _make_pipeline(
        seg=segmentation.s_one_one,
        prob=probability_estimation.p_boolean_sliding_window,
        conf=direct_confirmation_measure.log_ratio_measure,
        aggr=aggregation.arithmetic_mean,
    ),
    # Same pipeline as 'c_uci'; the two differ only in the `normalize` flag passed to `conf` at call time.
    'c_npmi': _make_pipeline(
        seg=segmentation.s_one_one,
        prob=probability_estimation.p_boolean_sliding_window,
        conf=direct_confirmation_measure.log_ratio_measure,
        aggr=aggregation.arithmetic_mean,
    ),
}

# Default `window_size` per coherence measure, used when the caller does not supply one.
SLIDING_WINDOW_SIZES = {
    'c_v': 110,
    'c_w2v': 5,
    'c_uci': 10,
    'c_npmi': 10,
    'u_mass': None,
}
class CoherenceModel(interfaces.TransformationABC):
    """Objects of this class allow for building and maintaining a model for topic coherence.

    Examples
    --------
    One way of using this feature is through providing a trained topic model. A dictionary has to be explicitly
    provided if the model does not contain a dictionary already.

    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.models.ldamodel import LdaModel
    >>> from gensim.models.coherencemodel import CoherenceModel
    >>>
    >>> model = LdaModel(common_corpus, 5, common_dictionary)
    >>>
    >>> cm = CoherenceModel(model=model, corpus=common_corpus, coherence='u_mass')
    >>> coherence = cm.get_coherence()  # get coherence value

    Another way of using this feature is through providing tokenized topics, such as:

    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.models.coherencemodel import CoherenceModel
    >>> topics = [
    ...     ['human', 'computer', 'system', 'interface'],
    ...     ['graph', 'minors', 'trees', 'eps']
    ... ]
    >>>
    >>> cm = CoherenceModel(topics=topics, corpus=common_corpus, dictionary=common_dictionary, coherence='u_mass')
    >>> coherence = cm.get_coherence()  # get coherence value

    """
def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None,
             window_size=None, keyed_vectors=None, coherence='c_v', topn=20, processes=-1):
    """

    Parameters
    ----------
    model : :class:`~gensim.models.basemodel.BaseTopicModel`, optional
        Pre-trained topic model, should be provided if topics is not provided.
        Currently supports :class:`~gensim.models.ldamodel.LdaModel`,
        :class:`~gensim.models.ldamulticore.LdaMulticore`, :class:`~gensim.models.wrappers.ldamallet.LdaMallet` and
        :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`.
        Use `topics` parameter to plug in an as yet unsupported model.
    topics : list of list of str, optional
        List of tokenized topics, if this is preferred over model - dictionary should be provided.
    texts : list of list of str, optional
        Tokenized texts, needed for coherence models that use sliding window based (i.e. coherence=`c_something`)
        probability estimator.
    corpus : iterable of list of (int, number), optional
        Corpus in BoW format.
    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
        Gensim dictionary mapping of id word to create corpus.
        If `model.id2word` is present, this is not needed. If both are provided, passed `dictionary` will be used.
    window_size : int, optional
        Is the size of the window to be used for coherence measures using boolean sliding window as their
        probability estimator. For 'u_mass' this doesn't matter.
        If None - the default window sizes are used which are: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10,
        'c_w2v' - 5.
    keyed_vectors : object, optional
        Stored as `self.keyed_vectors` and handed to the probability estimator as `model` when coherence is
        'c_w2v'. When it is provided together with coherence='c_w2v', `texts` is not required.
        Presumably a pre-trained word-vector model - TODO confirm the expected type against `p_word2vec`.
    coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi', 'c_w2v'}, optional
        Coherence measure to be used.
        Fastest method - 'u_mass', 'c_uci' also known as `c_pmi`.
        For 'u_mass' corpus should be provided, if texts is provided, it will be converted to corpus
        using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' `texts` should be provided (`corpus` isn't needed)
    topn : int, optional
        Integer corresponding to the number of top words to be extracted from each topic.
    processes : int, optional
        Number of processes to use for probability estimation phase, any value less than 1 will be interpreted as
        num_cpus - 1.

    Raises
    ------
    ValueError
        If neither `model` nor `topics` is given, if `topics` is given without `dictionary`, if none of
        `keyed_vectors`, `texts`, `corpus` is given, if the model only carries a placeholder dictionary,
        if the inputs required by the chosen coherence measure are missing, or if the measure is unknown.

    """
    if model is None and topics is None:
        raise ValueError("One of model or topics has to be provided.")
    elif topics is not None and dictionary is None:
        raise ValueError("dictionary has to be provided if topics are to be used.")

    self.keyed_vectors = keyed_vectors
    if keyed_vectors is None and texts is None and corpus is None:
        raise ValueError("One of texts or corpus has to be provided.")

    # Check if associated dictionary is provided.
    if dictionary is None:
        if isinstance(model.id2word, utils.FakeDict):
            raise ValueError(
                "The associated dictionary should be provided with the corpus or 'id2word'"
                " for topic model should be set as the associated dictionary.")
        else:
            self.dictionary = model.id2word
    else:
        self.dictionary = dictionary

    self.coherence = coherence
    self.window_size = window_size
    if self.window_size is None:
        # `.get` rather than indexing: an unknown measure must reach the explicit
        # "not currently supported" ValueError below instead of dying here with a bare KeyError.
        self.window_size = SLIDING_WINDOW_SIZES.get(self.coherence)
    self.texts = texts
    self.corpus = corpus

    # Check for correct inputs for u_mass coherence measure.
    if coherence in BOOLEAN_DOCUMENT_BASED:
        if utils.is_corpus(corpus)[0]:
            self.corpus = corpus
        elif self.texts is not None:
            self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
        else:
            raise ValueError(
                "Either 'corpus' with 'dictionary' or 'texts' should "
                "be provided for %s coherence." % coherence)

    # Check for correct inputs for sliding window coherence measure.
    elif coherence == 'c_w2v' and keyed_vectors is not None:
        pass
    elif coherence in SLIDING_WINDOW_BASED:
        if self.texts is None:
            raise ValueError("'texts' should be provided for %s coherence." % coherence)
    else:
        raise ValueError("%s coherence is not currently supported." % coherence)

    self._topn = topn
    self._model = model

    self._accumulator = None
    self._topics = None
    # Goes through the `topics` property setter, which converts tokens to ids
    # (or pulls topics from the model when `topics` is None).
    self.topics = topics

    self.processes = processes if processes >= 1 else max(1, mp.cpu_count() - 1)
@classmethod
def for_models(cls, models, dictionary, topn=20, **kwargs):
    """Initialize a CoherenceModel with estimated probabilities for all of the given models.

    Extracts the top words of every model and delegates to
    :meth:`~gensim.models.coherencemodel.CoherenceModel.for_topics`.

    Parameters
    ----------
    models : list of :class:`~gensim.models.basemodel.BaseTopicModel`
        List of models to evaluate coherence of, each of them should implement the
        :meth:`~gensim.models.basemodel.BaseTopicModel.get_topics` method.
    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
        Gensim dictionary mapping of id word.
    topn : int, optional
        Integer corresponding to the number of top words to be extracted from each topic.
    kwargs : object
        Additional keyword arguments, see :meth:`~gensim.models.coherencemodel.CoherenceModel.for_topics`.

    Return
    ------
    :class:`~gensim.models.coherencemodel.CoherenceModel`
        CoherenceModel with estimated probabilities for all of the given models.

    Example
    -------
    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.models.ldamodel import LdaModel
    >>> from gensim.models.coherencemodel import CoherenceModel
    >>>
    >>> m1 = LdaModel(common_corpus, 3, common_dictionary)
    >>> m2 = LdaModel(common_corpus, 5, common_dictionary)
    >>>
    >>> cm = CoherenceModel.for_models([m1, m2], common_dictionary, corpus=common_corpus, coherence='u_mass')

    """
    per_model_topics = []
    for current_model in models:
        per_model_topics.append(cls.top_topics_as_word_lists(current_model, dictionary, topn))

    # `dictionary` and `topn` are forwarded alongside whatever the caller supplied.
    kwargs.update(dictionary=dictionary, topn=topn)
    return cls.for_topics(per_model_topics, **kwargs)
@staticmethod
def top_topics_as_word_lists(model, dictionary, topn=20):
    """Get the `topn` highest-weighted words of every topic as lists of words.

    Parameters
    ----------
    model : :class:`~gensim.models.basemodel.BaseTopicModel`
        Pre-trained topic model.
    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
        Gensim dictionary mapping of id word. If its `id2token` mapping is empty, it is filled in place
        by inverting `token2id`.
    topn : int, optional
        Integer corresponding to the number of top words to be extracted from each topic.

    Return
    ------
    list of list of str
        Top topics in list-of-list-of-words format.

    """
    if not dictionary.id2token:
        # The reverse mapping has not been built yet; derive it from token2id.
        dictionary.id2token = dict((token_id, token) for token, token_id in dictionary.token2id.items())

    return [
        [dictionary.id2token[word_id] for word_id in matutils.argsort(weights, topn=topn, reverse=True)]
        for weights in model.get_topics()
    ]
@classmethod
def for_topics(cls, topics_as_topn_terms, **kwargs):
    """Initialize a CoherenceModel with estimated probabilities for all of the given topics.

    All words of all topics of all models are flattened into one "super topic"; probabilities are
    estimated once for it, and `topn` is then reset to the effective per-topic size.

    Parameters
    ----------
    topics_as_topn_terms : list of list of list of str
        Each element in the top-level list should be the list of topics for a model.
        The topics for the model should be a list of top-N words, one per topic.
    kwargs : object
        Keyword arguments forwarded to the constructor. An optional `topn` entry is consumed here and
        capped at the length of the longest topic.

    Return
    ------
    :class:`~gensim.models.coherencemodel.CoherenceModel`
        CoherenceModel with estimated probabilities for all of the given models.

    Raises
    ------
    ValueError
        If `topics_as_topn_terms` is empty or any model contributes an empty topic list.

    """
    if not topics_as_topn_terms:
        raise ValueError("len(topics) must be > 0.")
    if any(len(topic_lists) == 0 for topic_lists in topics_as_topn_terms):
        raise ValueError("found empty topic listing in `topics`")

    # Length of the longest topic across all models.
    topn = 0
    for topic_list in topics_as_topn_terms:
        for topic in topic_list:
            topn = max(topn, len(topic))

    # A caller-supplied `topn` may only shrink the value, never exceed what the topics contain.
    topn = min(kwargs.pop('topn', topn), topn)
    super_topic = utils.flatten(topics_as_topn_terms)

    logger.info(
        "Number of relevant terms for all %d models: %d",
        len(topics_as_topn_terms), len(super_topic))
    # `cls` rather than a hard-coded class name, so subclasses get an instance of their own type.
    cm = cls(topics=[super_topic], topn=len(super_topic), **kwargs)
    cm.estimate_probabilities()
    cm.topn = topn
    return cm
def __str__(self):
    """Return the string form of the pipeline selected by `self.measure`."""
    pipeline = self.measure
    return str(pipeline)
@property
def model(self):
    """Get the topic model stored in `self._model`.

    Return
    ------
    :class:`~gensim.models.basemodel.BaseTopicModel`
        Used model.

    """
    return self._model

@model.setter
def model(self, model):
    """Store a topic model in `self._model` and refresh the cached topics from it.

    Parameters
    ----------
    model : :class:`~gensim.models.basemodel.BaseTopicModel`
        Input model. When None, only `self._model` is updated; topics and accumulator are left alone.

    """
    self._model = model
    if model is None:
        return

    refreshed_topics = self._get_topics()
    # Must run before `_topics` is overwritten: the accumulator check compares new against current topics.
    self._update_accumulator(refreshed_topics)
    self._topics = refreshed_topics
@property
def topn(self):
    """Get number of top words `self._topn`.

    Return
    ------
    int
        Integer corresponding to the number of top words.

    """
    return self._topn

@topn.setter
def topn(self, topn):
    """Set number of top words `self._topn`.

    If the cached topics are shorter than `topn`, they are re-extracted from the model when one is
    available; without a model this is an error.

    Parameters
    ----------
    topn : int
        Number of top words.

    Raises
    ------
    ValueError
        If no model is set and the first cached topic holds fewer than `topn` words.

    """
    # Only the first topic is inspected to decide whether the cache is long enough.
    needs_more_words = topn > len(self._topics[0])

    if self.model is None:
        if needs_more_words:
            raise ValueError("Model unavailable and topic sizes are less than topn=%d" % topn)
        self._topn = topn  # topics will be truncated in getter
        return

    self._topn = topn
    if needs_more_words:
        self.model = self._model  # trigger topic expansion from model
@property
def measure(self):
    """Look up the pipeline registered for the current `coherence` value.

    Return
    ------
    namedtuple
        Pipeline that contains the functions needed to calculate coherence.

    """
    selected = self.coherence
    return COHERENCE_MEASURES[selected]
@property
def topics(self):
    """Get topics `self._topics`, truncated to `self._topn` entries when they are longer.

    Return
    ------
    list of list of str
        Topics as list of tokens.

    """
    stored = self._topics
    # The length of the first topic decides whether truncation is applied.
    if len(stored[0]) <= self._topn:
        return stored
    return [topic[:self._topn] for topic in stored]

@topics.setter
def topics(self, topics):
    """Set topics `self._topics`.

    Parameters
    ----------
    topics : list of list of str
        Topics. When None, topics are taken from the current model if there is one, otherwise cleared.

    """
    if topics is None:
        if self.model is None:
            new_topics = None
        else:
            new_topics = self._get_topics()
            logger.debug("Setting topics to those of the model: %s", self.model)
    else:
        new_topics = [self._ensure_elements_are_ids(topic) for topic in topics]
        if self.model is not None:
            logger.warning(
                "The currently set model '%s' may be inconsistent with the newly set topics",
                self.model)

    # Accumulator check first: it compares the new topics against the ones still stored.
    self._update_accumulator(new_topics)
    self._topics = new_topics
def _ensure_elements_are_ids(self, topic):
    """Convert a topic given as tokens (or already as token ids) into an array of token ids.

    Parameters
    ----------
    topic : iterable of str or iterable of int
        Either tokens known to `self.dictionary`, or token ids known to it.

    Return
    ------
    :class:`numpy.ndarray`
        Token ids of the topic, in the original order.

    Raises
    ------
    KeyError
        If the elements are neither all known tokens nor all known ids.

    """
    token2id = self.dictionary.token2id
    try:
        return np.array([token2id[token] for token in topic])
    except KeyError:  # might be a list of token ids already, but let's verify all in dict
        id2token = self.dictionary.id2token
        if len(id2token) != len(token2id):
            # The dictionary's reverse mapping may be empty or stale; derive a local one from
            # token2id so that valid ids are not rejected. The dictionary itself is not modified.
            id2token = {token_id: token for token, token_id in token2id.items()}
        tokens = [id2token[_id] for _id in topic]
        return np.array([token2id[token] for token in tokens])
def _update_accumulator(self, new_topics):
    """Drop the cached accumulator when it does not cover the ids needed by `new_topics`."""
    if not self._relevant_ids_will_differ(new_topics):
        return
    logger.debug("Wiping cached accumulator since it does not contain all relevant ids.")
    self._accumulator = None
def _relevant_ids_will_differ(self, new_topics):
    """Tell whether `new_topics` need ids that the cached accumulator does not hold.

    Returns False when there is no accumulator or when the topics are unchanged.
    """
    if self._accumulator is None:
        return False
    if not self._topics_differ(new_topics):
        return False

    needed_ids = unique_ids_from_segments(self.measure.seg(new_topics))
    all_covered = self._accumulator.relevant_ids.issuperset(needed_ids)
    return not all_covered
def _topics_differ(self, new_topics):
    """Return True only when both the new and the stored topics exist and are not equal."""
    if new_topics is None or self._topics is None:
        return False
    return not np.array_equal(new_topics, self._topics)
def _get_topics(self):
    """Internal helper: extract the top `self.topn` word ids per topic from `self.model`."""
    extract = self._get_topics_from_model
    return extract(self.model, self.topn)
@staticmethod
def _get_topics_from_model(model, topn):
    """Internal helper function to return topics from a trained topic model.

    Parameters
    ----------
    model : :class:`~gensim.models.basemodel.BaseTopicModel`
        Pre-trained topic model.
    topn : int
        Integer corresponding to the number of top words.

    Return
    ------
    list of :class:`numpy.ndarray`
        Topics matrix

    Raises
    ------
    ValueError
        If obtaining the topics via `model.get_topics()` raises AttributeError
        (typically because the model does not implement that method).

    """
    # Only the `get_topics` call is guarded, so an AttributeError raised while sorting
    # is not misreported as an unsupported model.
    try:
        topic_term_matrix = model.get_topics()
    except AttributeError:
        raise ValueError(
            "This topic model is not currently supported. Supported topic models"
            " should implement the `get_topics` method.")

    return [matutils.argsort(topic, topn=topn, reverse=True) for topic in topic_term_matrix]
def segment_topics(self):
    """Segment the current topics with the pipeline's segmentation function.

    Shorthand for `self.measure.seg(self.topics)`.

    Return
    ------
    list of list of pair
        Segmented topics.

    """
    segmenter = self.measure.seg
    return segmenter(self.topics)
def estimate_probabilities(self, segmented_topics=None):
    """Accumulate word occurrences and co-occurrences from texts or corpus using the optimal method for the chosen
    coherence metric.

    Notes
    -----
    This operation may take quite some time for the sliding window based coherence methods.

    Parameters
    ----------
    segmented_topics : list of list of pair, optional
        Segmented topics, typically produced by :meth:`~gensim.models.coherencemodel.CoherenceModel.segment_topics`.
        Computed from the current topics when omitted.

    Return
    ------
    :class:`~gensim.topic_coherence.text_analysis.CorpusAccumulator`
        Corpus accumulator. It is also cached on the instance as `self._accumulator`.

    """
    if segmented_topics is None:
        segmented_topics = self.segment_topics()

    if self.coherence in BOOLEAN_DOCUMENT_BASED:
        # Document-based estimation works on the BoW corpus.
        accumulator = self.measure.prob(self.corpus, segmented_topics)
    else:
        estimator_args = {
            'texts': self.texts,
            'segmented_topics': segmented_topics,
            'dictionary': self.dictionary,
            'window_size': self.window_size,
            'processes': self.processes,
        }
        if self.coherence == 'c_w2v':
            estimator_args['model'] = self.keyed_vectors
        accumulator = self.measure.prob(**estimator_args)

    self._accumulator = accumulator
    return accumulator
def get_coherence_per_topic(self, segmented_topics=None, with_std=False, with_support=False):
    """Get list of coherence values for each topic based on pipeline parameters.

    Parameters
    ----------
    segmented_topics : list of list of pair, optional
        Segmented topics; computed from the current topics with the pipeline's segmentation when omitted.
    with_std : bool, optional
        True to also include standard deviation across topic segment sets in addition to the mean coherence
        for each topic.
    with_support : bool, optional
        True to also include support across topic segments. The support is defined as the number of pairwise
        similarity comparisons that were used to compute the overall topic coherence.

    Return
    ------
    list of float
        Sequence of similarity measure for each topic.

    """
    pipeline = self.measure
    if segmented_topics is None:
        segmented_topics = pipeline.seg(self.topics)
    if self._accumulator is None:
        # Fills `self._accumulator` as a side effect.
        self.estimate_probabilities(segmented_topics)

    conf_kwargs = {'with_std': with_std, 'with_support': with_support}
    takes_no_extras = self.coherence in BOOLEAN_DOCUMENT_BASED or self.coherence == 'c_w2v'
    if not takes_no_extras:
        if self.coherence == 'c_v':
            conf_kwargs.update(topics=self.topics, measure='nlr', gamma=1)
        else:
            # 'c_uci' and 'c_npmi' share a confirmation function and differ only in normalization.
            conf_kwargs['normalize'] = (self.coherence == 'c_npmi')

    return pipeline.conf(segmented_topics, self._accumulator, **conf_kwargs)
def aggregate_measures(self, topic_coherences):
    """Aggregate the individual topic coherence measures using the pipeline's aggregation function.

    Shorthand for `self.measure.aggr(topic_coherences)`.

    Parameters
    ----------
    topic_coherences : list of float
        List of calculated confirmation measure on each set in the segmented topics.

    Returns
    -------
    float
        Result of the pipeline's aggregation function applied to `topic_coherences`.

    """
    aggregate = self.measure.aggr
    return aggregate(topic_coherences)
def get_coherence(self):
    """Get coherence value based on pipeline parameters.

    Computes the per-topic coherences and aggregates them.

    Returns
    -------
    float
        Value of coherence.

    """
    per_topic = self.get_coherence_per_topic()
    return self.aggregate_measures(per_topic)
def compare_models(self, models):
    """Compare topic models by coherence value.

    Extracts the top `self.topn` words of every model and evaluates them via
    :meth:`~gensim.models.coherencemodel.CoherenceModel.compare_model_topics`.

    Parameters
    ----------
    models : :class:`~gensim.models.basemodel.BaseTopicModel`
        Sequence of topic models.

    Returns
    -------
    list of (float, float)
        Sequence of pairs of average topic coherence and average coherence.

    """
    topics_per_model = []
    for candidate in models:
        topics_per_model.append(self._get_topics_from_model(candidate, self.topn))
    return self.compare_model_topics(topics_per_model)
def compare_model_topics(self, model_topics):
    """Perform the coherence evaluation for each of the models.

    Parameters
    ----------
    model_topics : list of list of str
        list of list of words for the model trained with that number of topics.

    Returns
    -------
    list of (float, float)
        Sequence of pairs of average topic coherence and average coherence.

    Notes
    -----
    This first precomputes the probabilities once, then evaluates coherence for each model.

    Since we have already precomputed the probabilities, this simply involves using the accumulated stats in the
    :class:`~gensim.models.coherencemodel.CoherenceModel` to perform the evaluations, which should be pretty quick.

    The instance's own topics and `topn` are restored afterwards, even if the evaluation fails.

    """
    saved_topics, saved_topn = self._topics, self.topn
    try:
        return self._compare_model_topics(model_topics)
    finally:
        # The evaluation overwrites topics and topn; put the caller's state back.
        self.topics = saved_topics
        self.topn = saved_topn
def _compare_model_topics(self, model_topics):
    """Get average topic and model coherences.

    Parameters
    ----------
    model_topics : list of list of str
        Topics from the model.

    Returns
    -------
    list of (float, float)
        Sequence of pairs of average topic coherence and average coherence.

    Notes
    -----
    Overwrites `self.topics` and `self.topn` while running; the caller is expected to restore them.

    """
    coherences = []
    # Evaluate at topn, topn - 5, topn - 10, ... while the value stays above min(topn - 1, 4).
    last_topn_value = min(self.topn - 1, 4)
    topn_grid = list(range(self.topn, last_topn_value, -5))

    for model_num, topics in enumerate(model_topics):
        self.topics = topics

        # We evaluate at various values of N and average them. This is a more robust,
        # according to: http://people.eng.unimelb.edu.au/tbaldwin/pubs/naacl2016.pdf
        per_topic_at_n = []
        aggregated_at_n = []
        for n in topn_grid:
            self.topn = n
            topic_coherences = self.get_coherence_per_topic()

            # Let's record the coherences for each topic, as well as the aggregated
            # coherence across all of the topics.
            # Some of them may be nan (if all words were OOV), so do mean value imputation.
            filled_coherences = np.array(topic_coherences)
            filled_coherences[np.isnan(filled_coherences)] = np.nanmean(filled_coherences)
            per_topic_at_n.append(topic_coherences)
            aggregated_at_n.append(self.aggregate_measures(filled_coherences))

        # The per-topic average uses the raw (non-imputed) values.
        avg_topic_coherences = np.vstack(per_topic_at_n).mean(0)
        model_coherence = np.mean(aggregated_at_n)
        # Module logger with lazy %-arguments, consistent with the rest of this module.
        logger.info("Avg coherence for model %d: %.5f", model_num, model_coherence)
        coherences.append((avg_topic_coherences, model_coherence))

    return coherences