696 lines
25 KiB
Python
696 lines
25 KiB
Python
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
|
|
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
|
|
|
|
"""Calculate topic coherence for topic models. This is the implementation of the four stage topic coherence pipeline
|
|
from the paper `Michael Roeder, Andreas Both and Alexander Hinneburg: "Exploring the space of topic coherence measures"
|
|
<http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf>`_.
|
|
Typically, :class:`~gensim.models.coherencemodel.CoherenceModel` is used for evaluation of topic models.
|
|
|
|
The four stage pipeline is basically:
|
|
|
|
* Segmentation
|
|
* Probability Estimation
|
|
* Confirmation Measure
|
|
* Aggregation
|
|
|
|
Implementation of this pipeline allows the user to, in essence, "make" a coherence measure of their choice
|
|
by choosing a method for each stage of the pipeline.
|
|
|
|
See Also
|
|
--------
|
|
:mod:`gensim.topic_coherence`
|
|
Internal functions for pipelines.
|
|
|
|
"""
|
|
import logging
|
|
import multiprocessing as mp
|
|
from collections import namedtuple
|
|
|
|
import numpy as np
|
|
|
|
from gensim import interfaces, matutils
|
|
from gensim import utils
|
|
from gensim.topic_coherence import (segmentation, probability_estimation,
|
|
direct_confirmation_measure, indirect_confirmation_measure,
|
|
aggregation)
|
|
from gensim.topic_coherence.probability_estimation import unique_ids_from_segments
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Measures whose probabilities are estimated from a BoW corpus (see `CoherenceModel.estimate_probabilities`).
BOOLEAN_DOCUMENT_BASED = {'u_mass'}
# Measures that require tokenized `texts` (or, for 'c_w2v', optionally pre-built keyed vectors).
SLIDING_WINDOW_BASED = {'c_v', 'c_uci', 'c_npmi', 'c_w2v'}

# One callable per pipeline stage: segmentation, probability estimation, confirmation measure, aggregation.
_make_pipeline = namedtuple('Coherence_Measure', ['seg', 'prob', 'conf', 'aggr'])

# Maps the `coherence` name accepted by `CoherenceModel` to its four-stage pipeline.
COHERENCE_MEASURES = {
    'u_mass': _make_pipeline(
        seg=segmentation.s_one_pre,
        prob=probability_estimation.p_boolean_document,
        conf=direct_confirmation_measure.log_conditional_probability,
        aggr=aggregation.arithmetic_mean,
    ),
    'c_v': _make_pipeline(
        seg=segmentation.s_one_set,
        prob=probability_estimation.p_boolean_sliding_window,
        conf=indirect_confirmation_measure.cosine_similarity,
        aggr=aggregation.arithmetic_mean,
    ),
    'c_w2v': _make_pipeline(
        seg=segmentation.s_one_set,
        prob=probability_estimation.p_word2vec,
        conf=indirect_confirmation_measure.word2vec_similarity,
        aggr=aggregation.arithmetic_mean,
    ),
    'c_uci': _make_pipeline(
        seg=segmentation.s_one_one,
        prob=probability_estimation.p_boolean_sliding_window,
        conf=direct_confirmation_measure.log_ratio_measure,
        aggr=aggregation.arithmetic_mean,
    ),
    # Same stages as 'c_uci'; `CoherenceModel.get_coherence_per_topic` passes `normalize=True` for 'c_npmi'.
    'c_npmi': _make_pipeline(
        seg=segmentation.s_one_one,
        prob=probability_estimation.p_boolean_sliding_window,
        conf=direct_confirmation_measure.log_ratio_measure,
        aggr=aggregation.arithmetic_mean,
    ),
}

# Window size used when the caller passes `window_size=None`.
SLIDING_WINDOW_SIZES = dict(
    c_v=110,
    c_w2v=5,
    c_uci=10,
    c_npmi=10,
    u_mass=None,
)
|
|
|
|
|
|
class CoherenceModel(interfaces.TransformationABC):
    """Objects of this class allow for building and maintaining a model for topic coherence.

    Examples
    --------
    One way of using this feature is through providing a trained topic model. A dictionary has to be explicitly
    provided if the model does not contain a dictionary already

    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.models.ldamodel import LdaModel
    >>> from gensim.models.coherencemodel import CoherenceModel
    >>>
    >>> model = LdaModel(common_corpus, 5, common_dictionary)
    >>>
    >>> cm = CoherenceModel(model=model, corpus=common_corpus, coherence='u_mass')
    >>> coherence = cm.get_coherence()  # get coherence value

    Another way of using this feature is through providing tokenized topics such as

    >>> from gensim.test.utils import common_corpus, common_dictionary
    >>> from gensim.models.coherencemodel import CoherenceModel
    >>> topics = [
    ...    ['human', 'computer', 'system', 'interface'],
    ...    ['graph', 'minors', 'trees', 'eps']
    ... ]
    >>>
    >>> cm = CoherenceModel(topics=topics, corpus=common_corpus, dictionary=common_dictionary, coherence='u_mass')
    >>> coherence = cm.get_coherence()  # get coherence value

    """
    def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None,
                 window_size=None, keyed_vectors=None, coherence='c_v', topn=20, processes=-1):
        """

        Parameters
        ----------
        model : :class:`~gensim.models.basemodel.BaseTopicModel`, optional
            Pre-trained topic model, should be provided if topics is not provided.
            Currently supports :class:`~gensim.models.ldamodel.LdaModel`,
            :class:`~gensim.models.ldamulticore.LdaMulticore`, :class:`~gensim.models.wrappers.ldamallet.LdaMallet` and
            :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`.
            Use `topics` parameter to plug in an as yet unsupported model.
        topics : list of list of str, optional
            List of tokenized topics, if this is preferred over model - dictionary should be provided.
        texts : list of list of str, optional
            Tokenized texts, needed for coherence models that use sliding window based (i.e. coherence=`c_something`)
            probability estimator.
        corpus : iterable of list of (int, number), optional
            Corpus in BoW format.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Gensim dictionary mapping of id word to create corpus.
            If `model.id2word` is present, this is not needed. If both are provided, passed `dictionary` will be used.
        window_size : int, optional
            Is the size of the window to be used for coherence measures using boolean sliding window as their
            probability estimator. For 'u_mass' this doesn't matter.
            If None - the default window sizes are used which are: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10,
            'c_w2v' - 5.
        keyed_vectors : object, optional
            Word-vector model handed to the probability estimator as `model` when `coherence` is 'c_w2v'
            (presumably a gensim `KeyedVectors` instance - confirm against `probability_estimation.p_word2vec`).
            When it is given, neither `texts` nor `corpus` is required for 'c_w2v'.
        coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi', 'c_w2v'}, optional
            Coherence measure to be used.
            Fastest method - 'u_mass', 'c_uci' also known as `c_pmi`.
            For 'u_mass' corpus should be provided, if texts is provided, it will be converted to corpus
            using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' `texts` should be provided (`corpus` isn't needed)
        topn : int, optional
            Integer corresponding to the number of top words to be extracted from each topic.
        processes : int, optional
            Number of processes to use for probability estimation phase, any value less than 1 will be interpreted as
            num_cpus - 1.

        Raises
        ------
        ValueError
            If neither `model` nor `topics` is given, if `topics` is given without `dictionary`, if none of
            `keyed_vectors`, `texts` and `corpus` is given, if no usable dictionary is available, if the inputs
            required by the chosen `coherence` are missing, or if `coherence` is not a supported measure.

        """
        if model is None and topics is None:
            raise ValueError("One of model or topics has to be provided.")
        elif topics is not None and dictionary is None:
            raise ValueError("dictionary has to be provided if topics are to be used.")

        self.keyed_vectors = keyed_vectors
        if keyed_vectors is None and texts is None and corpus is None:
            raise ValueError("One of texts or corpus has to be provided.")

        # Check if associated dictionary is provided.
        if dictionary is None:
            if isinstance(model.id2word, utils.FakeDict):
                raise ValueError(
                    "The associated dictionary should be provided with the corpus or 'id2word'"
                    " for topic model should be set as the associated dictionary.")
            else:
                self.dictionary = model.id2word
        else:
            self.dictionary = dictionary

        self.coherence = coherence
        self.window_size = window_size
        if self.window_size is None:
            # `.get` rather than indexing: an unknown measure must reach the explicit
            # "not currently supported" ValueError below instead of failing here with a bare KeyError.
            self.window_size = SLIDING_WINDOW_SIZES.get(self.coherence)
        self.texts = texts
        self.corpus = corpus

        # Check for correct inputs for u_mass coherence measure.
        if coherence in BOOLEAN_DOCUMENT_BASED:
            if utils.is_corpus(corpus)[0]:
                self.corpus = corpus
            elif self.texts is not None:
                self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
            else:
                raise ValueError(
                    "Either 'corpus' with 'dictionary' or 'texts' should "
                    "be provided for %s coherence." % (coherence,))

        # Check for correct inputs for sliding window coherence measure.
        elif coherence == 'c_w2v' and keyed_vectors is not None:
            pass
        elif coherence in SLIDING_WINDOW_BASED:
            if self.texts is None:
                raise ValueError("'texts' should be provided for %s coherence." % (coherence,))
        else:
            raise ValueError("%s coherence is not currently supported." % (coherence,))

        self._topn = topn
        self._model = model
        self._accumulator = None
        self._topics = None
        # Goes through the `topics` setter: converts tokens to ids, or pulls topics from the model.
        self.topics = topics

        self.processes = processes if processes >= 1 else max(1, mp.cpu_count() - 1)

    @classmethod
    def for_models(cls, models, dictionary, topn=20, **kwargs):
        """Initialize a CoherenceModel with estimated probabilities for all of the given models.
        Use :meth:`~gensim.models.coherencemodel.CoherenceModel.for_topics` method.

        Parameters
        ----------
        models : list of :class:`~gensim.models.basemodel.BaseTopicModel`
            List of models to evaluate coherence of, each of them should implement the
            :meth:`~gensim.models.basemodel.BaseTopicModel.get_topics` method.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
            Gensim dictionary mapping of id word.
        topn : int, optional
            Integer corresponding to the number of top words to be extracted from each topic.
        kwargs : object
            Sequence of arguments, see :meth:`~gensim.models.coherencemodel.CoherenceModel.for_topics`.

        Returns
        -------
        :class:`~gensim.models.coherencemodel.CoherenceModel`
            CoherenceModel with estimated probabilities for all of the given models.

        Example
        -------
        >>> from gensim.test.utils import common_corpus, common_dictionary
        >>> from gensim.models.ldamodel import LdaModel
        >>> from gensim.models.coherencemodel import CoherenceModel
        >>>
        >>> m1 = LdaModel(common_corpus, 3, common_dictionary)
        >>> m2 = LdaModel(common_corpus, 5, common_dictionary)
        >>>
        >>> cm = CoherenceModel.for_models([m1, m2], common_dictionary, corpus=common_corpus, coherence='u_mass')
        """
        topics = [cls.top_topics_as_word_lists(model, dictionary, topn) for model in models]
        kwargs['dictionary'] = dictionary
        kwargs['topn'] = topn
        return cls.for_topics(topics, **kwargs)

    @staticmethod
    def top_topics_as_word_lists(model, dictionary, topn=20):
        """Get `topn` topics as list of words.

        Parameters
        ----------
        model : :class:`~gensim.models.basemodel.BaseTopicModel`
            Pre-trained topic model.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
            Gensim dictionary mapping of id word. Its `id2token` mapping is filled in from `token2id`
            (in place) when it is empty.
        topn : int, optional
            Integer corresponding to the number of top words to be extracted from each topic.

        Returns
        -------
        list of list of str
            Top topics in list-of-list-of-words format.

        """
        if not dictionary.id2token:
            dictionary.id2token = {v: k for k, v in dictionary.token2id.items()}

        str_topics = []
        for topic in model.get_topics():
            bestn = matutils.argsort(topic, topn=topn, reverse=True)
            str_topics.append([dictionary.id2token[_id] for _id in bestn])
        return str_topics

    @classmethod
    def for_topics(cls, topics_as_topn_terms, **kwargs):
        """Initialize a CoherenceModel with estimated probabilities for all of the given topics.

        Parameters
        ----------
        topics_as_topn_terms : list of list of list of str
            Each element in the top-level list should be the list of topics for a model.
            The topics for the model should be a list of top-N words, one per topic.
        kwargs : object
            Forwarded to the class constructor. A `topn` entry, if present, is removed and capped at the
            length of the longest topic before being applied to the returned model.

        Returns
        -------
        :class:`~gensim.models.coherencemodel.CoherenceModel`
            CoherenceModel with estimated probabilities for all of the given models.

        Raises
        ------
        ValueError
            If `topics_as_topn_terms` is empty or contains an empty topic listing.

        """
        if not topics_as_topn_terms:
            raise ValueError("len(topics) must be > 0.")
        if any(len(topic_lists) == 0 for topic_lists in topics_as_topn_terms):
            raise ValueError("found empty topic listing in `topics`")

        # Length of the longest topic over all models; the requested `topn` can never exceed it.
        topn = 0
        for topic_list in topics_as_topn_terms:
            for topic in topic_list:
                topn = max(topn, len(topic))

        topn = min(kwargs.pop('topn', topn), topn)
        super_topic = utils.flatten(topics_as_topn_terms)

        logger.info(
            "Number of relevant terms for all %d models: %d",
            len(topics_as_topn_terms), len(super_topic))
        # Build one "super topic" holding every term so probabilities are estimated once for all models.
        # `cls` (not the hard-coded base class) so that subclasses get an instance of their own type.
        cm = cls(topics=[super_topic], topn=len(super_topic), **kwargs)
        cm.estimate_probabilities()
        cm.topn = topn
        return cm

    def __str__(self):
        return str(self.measure)

    @property
    def model(self):
        """Get `self._model` field.

        Returns
        -------
        :class:`~gensim.models.basemodel.BaseTopicModel`
            Used model.

        """
        return self._model

    @model.setter
    def model(self, model):
        """Set `self._model` field.

        When `model` is not None, topics are re-read from it and the cached accumulator is dropped if it
        does not cover the ids of the new topics.

        Parameters
        ----------
        model : :class:`~gensim.models.basemodel.BaseTopicModel`
            Input model.

        """
        self._model = model
        if model is not None:
            new_topics = self._get_topics()
            self._update_accumulator(new_topics)
            self._topics = new_topics

    @property
    def topn(self):
        """Get number of top words `self._topn`.

        Returns
        -------
        int
            Integer corresponding to the number of top words.

        """
        return self._topn

    @topn.setter
    def topn(self, topn):
        """Set number of top words `self._topn`.

        Parameters
        ----------
        topn : int
            Number of top words.

        Raises
        ------
        ValueError
            If no model is set and the stored topics are shorter than `topn`.

        """
        current_topic_length = len(self._topics[0])
        requires_expansion = current_topic_length < topn

        if self.model is not None:
            self._topn = topn
            if requires_expansion:
                self.model = self._model  # trigger topic expansion from model
        else:
            if requires_expansion:
                raise ValueError("Model unavailable and topic sizes are less than topn=%d" % topn)
            self._topn = topn  # topics will be truncated in getter

    @property
    def measure(self):
        """Make pipeline, according to `coherence` parameter value.

        Returns
        -------
        namedtuple
            Pipeline that contains needed functions/method for calculated coherence.

        """
        return COHERENCE_MEASURES[self.coherence]

    @property
    def topics(self):
        """Get topics `self._topics`, truncated to the first `self._topn` entries when they are longer.

        Returns
        -------
        list of :class:`numpy.ndarray`
            Topics as sequences of token ids.

        """
        if len(self._topics[0]) > self._topn:
            return [topic[:self._topn] for topic in self._topics]
        else:
            return self._topics

    @topics.setter
    def topics(self, topics):
        """Set topics `self._topics`.

        If `topics` is None the topics are taken from the current model, when one is set.

        Parameters
        ----------
        topics : list of list of str
            Topics.

        """
        if topics is not None:
            new_topics = [self._ensure_elements_are_ids(topic) for topic in topics]

            if self.model is not None:
                logger.warning(
                    "The currently set model '%s' may be inconsistent with the newly set topics",
                    self.model)
        elif self.model is not None:
            new_topics = self._get_topics()
            logger.debug("Setting topics to those of the model: %s", self.model)
        else:
            new_topics = None

        self._update_accumulator(new_topics)
        self._topics = new_topics

    def _ensure_elements_are_ids(self, topic):
        """Convert a topic given as tokens (or as token ids) into an array of token ids."""
        try:
            return np.array([self.dictionary.token2id[token] for token in topic])
        except KeyError:  # might be a list of token ids already, but let's verify all in dict
            topic = [self.dictionary.id2token[_id] for _id in topic]
            return np.array([self.dictionary.token2id[token] for token in topic])

    def _update_accumulator(self, new_topics):
        """Drop the cached accumulator when it does not cover the ids needed by `new_topics`."""
        if self._relevant_ids_will_differ(new_topics):
            logger.debug("Wiping cached accumulator since it does not contain all relevant ids.")
            self._accumulator = None

    def _relevant_ids_will_differ(self, new_topics):
        """Check whether `new_topics` need ids that the cached accumulator does not hold."""
        if self._accumulator is None or not self._topics_differ(new_topics):
            return False

        new_set = unique_ids_from_segments(self.measure.seg(new_topics))
        return not self._accumulator.relevant_ids.issuperset(new_set)

    def _topics_differ(self, new_topics):
        """Check whether both `new_topics` and the stored topics are set and unequal."""
        return (new_topics is not None and
                self._topics is not None and
                not np.array_equal(new_topics, self._topics))

    def _get_topics(self):
        """Internal helper function to return topics from a trained topic model."""
        return self._get_topics_from_model(self.model, self.topn)

    @staticmethod
    def _get_topics_from_model(model, topn):
        """Internal helper function to return topics from a trained topic model.

        Parameters
        ----------
        model : :class:`~gensim.models.basemodel.BaseTopicModel`
            Pre-trained topic model.
        topn : int
            Integer corresponding to the number of top words.

        Returns
        -------
        list of :class:`numpy.ndarray`
            Topics matrix

        Raises
        ------
        ValueError
            If an AttributeError occurs while reading the topics, e.g. `model` has no `get_topics` method.

        """
        try:
            return [
                matutils.argsort(topic, topn=topn, reverse=True) for topic in
                model.get_topics()
            ]
        except AttributeError:
            raise ValueError(
                "This topic model is not currently supported. Supported topic models"
                " should implement the `get_topics` method.")

    def segment_topics(self):
        """Segment topic, alias for `self.measure.seg(self.topics)`.

        Returns
        -------
        list of list of pair
            Segmented topics.

        """
        return self.measure.seg(self.topics)

    def estimate_probabilities(self, segmented_topics=None):
        """Accumulate word occurrences and co-occurrences from texts or corpus using the optimal method for the chosen
        coherence metric.

        Notes
        -----
        This operation may take quite some time for the sliding window based coherence methods.

        Parameters
        ----------
        segmented_topics : list of list of pair, optional
            Segmented topics, typically produced by :meth:`~gensim.models.coherencemodel.CoherenceModel.segment_topics`.
            Computed from the current topics when None.

        Returns
        -------
        :class:`~gensim.topic_coherence.text_analysis.CorpusAccumulator`
            Corpus accumulator.

        """
        if segmented_topics is None:
            segmented_topics = self.segment_topics()

        if self.coherence in BOOLEAN_DOCUMENT_BASED:
            self._accumulator = self.measure.prob(self.corpus, segmented_topics)
        else:
            kwargs = dict(
                texts=self.texts, segmented_topics=segmented_topics,
                dictionary=self.dictionary, window_size=self.window_size,
                processes=self.processes)
            if self.coherence == 'c_w2v':
                kwargs['model'] = self.keyed_vectors

            self._accumulator = self.measure.prob(**kwargs)

        return self._accumulator

    def get_coherence_per_topic(self, segmented_topics=None, with_std=False, with_support=False):
        """Get list of coherence values for each topic based on pipeline parameters.

        Parameters
        ----------
        segmented_topics : list of list of pair, optional
            Segmented topics. Computed from the current topics when None.
        with_std : bool, optional
            True to also include standard deviation across topic segment sets in addition to the mean coherence
            for each topic.
        with_support : bool, optional
            True to also include support across topic segments. The support is defined as the number of pairwise
            similarity comparisons were used to compute the overall topic coherence.

        Returns
        -------
        list of float
            Sequence of similarity measure for each topic.

        """
        measure = self.measure
        if segmented_topics is None:
            segmented_topics = measure.seg(self.topics)
        if self._accumulator is None:
            self.estimate_probabilities(segmented_topics)

        # Each confirmation measure takes a slightly different set of extra keyword arguments.
        kwargs = dict(with_std=with_std, with_support=with_support)
        if self.coherence in BOOLEAN_DOCUMENT_BASED or self.coherence == 'c_w2v':
            pass
        elif self.coherence == 'c_v':
            kwargs['topics'] = self.topics
            kwargs['measure'] = 'nlr'
            kwargs['gamma'] = 1
        else:
            kwargs['normalize'] = (self.coherence == 'c_npmi')

        return measure.conf(segmented_topics, self._accumulator, **kwargs)

    def aggregate_measures(self, topic_coherences):
        """Aggregate the individual topic coherence measures using the pipeline's aggregation function.
        Use `self.measure.aggr(topic_coherences)`.

        Parameters
        ----------
        topic_coherences : list of float
            List of calculated confirmation measure on each set in the segmented topics.

        Returns
        -------
        float
            Arithmetic mean of all the values contained in confirmation measures.

        """
        return self.measure.aggr(topic_coherences)

    def get_coherence(self):
        """Get coherence value based on pipeline parameters.

        Returns
        -------
        float
            Value of coherence.

        """
        confirmed_measures = self.get_coherence_per_topic()
        return self.aggregate_measures(confirmed_measures)

    def compare_models(self, models):
        """Compare topic models by coherence value.

        Parameters
        ----------
        models : list of :class:`~gensim.models.basemodel.BaseTopicModel`
            Sequence of topic models.

        Returns
        -------
        list of (numpy.ndarray, float)
            Sequence of pairs of average topic coherence and average coherence.

        """
        model_topics = [self._get_topics_from_model(model, self.topn) for model in models]
        return self.compare_model_topics(model_topics)

    def compare_model_topics(self, model_topics):
        """Perform the coherence evaluation for each of the models.

        Parameters
        ----------
        model_topics : list of list of list of str
            One entry per model; each entry is that model's list of topics (as tokens or token ids).

        Returns
        -------
        list of (numpy.ndarray, float)
            Sequence of pairs of average topic coherence and average coherence.

        Notes
        -----
        This first precomputes the probabilities once, then evaluates coherence for each model.

        Since we have already precomputed the probabilities, this simply involves using the accumulated stats in the
        :class:`~gensim.models.coherencemodel.CoherenceModel` to perform the evaluations, which should be pretty quick.

        """
        orig_topics = self._topics
        orig_topn = self.topn

        try:
            coherences = self._compare_model_topics(model_topics)
        finally:
            # The evaluation overwrites topics and topn; always restore the caller's state.
            self.topics = orig_topics
            self.topn = orig_topn

        return coherences

    def _compare_model_topics(self, model_topics):
        """Get average topic and model coherences.

        Parameters
        ----------
        model_topics : list of list of list of str
            Topics of each model.

        Returns
        -------
        list of (numpy.ndarray, float)
            Sequence of pairs of average topic coherence and average coherence.

        """
        coherences = []
        # Evaluate at topn, topn - 5, topn - 10, ... stopping before values below 5 (or below topn itself).
        last_topn_value = min(self.topn - 1, 4)
        topn_grid = list(range(self.topn, last_topn_value, -5))

        for model_num, topics in enumerate(model_topics):
            self.topics = topics

            # We evaluate at various values of N and average them. This is a more robust,
            # according to: http://people.eng.unimelb.edu.au/tbaldwin/pubs/naacl2016.pdf
            coherence_at_n = {}
            for n in topn_grid:
                self.topn = n
                topic_coherences = self.get_coherence_per_topic()

                # Let's record the coherences for each topic, as well as the aggregated
                # coherence across all of the topics.
                # Some of them may be nan (if all words were OOV), so do mean value imputation.
                filled_coherences = np.array(topic_coherences)
                filled_coherences[np.isnan(filled_coherences)] = np.nanmean(filled_coherences)
                coherence_at_n[n] = (topic_coherences, self.aggregate_measures(filled_coherences))

            topic_coherences, avg_coherences = zip(*coherence_at_n.values())
            avg_topic_coherences = np.vstack(topic_coherences).mean(0)
            model_coherence = np.mean(avg_coherences)
            logger.info("Avg coherence for model %d: %.5f", model_num, model_coherence)
            coherences.append((avg_topic_coherences, model_coherence))

        return coherences
|