#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2014 Artyom Topchyan
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
# Based on Copyright (C) 2014 Radim Rehurek

"""Python wrapper for `Dynamic Topic Models (DTM) `_ and the `Document Influence Model (DIM) `_.

Installation
------------

You have 2 ways, how to make binaries:

#. Use precompiled binaries for your OS version from `/magsilva/dtm/ `_
#. Compile binaries manually from `/blei-lab/dtm `_
   (original instruction available in https://github.com/blei-lab/dtm/blob/master/README.md), or use this ::

    git clone https://github.com/blei-lab/dtm.git
    sudo apt-get install libgsl0-dev
    cd dtm/dtm
    make

Examples
--------
>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.models.wrappers import DtmModel
>>>
>>> path_to_dtm_binary = "/path/to/dtm/binary"
>>> model = DtmModel(
...     path_to_dtm_binary, corpus=common_corpus, id2word=common_dictionary,
...     time_slices=[1] * len(common_corpus)
... )

"""

import logging
import random
import warnings
import tempfile
import os
from subprocess import PIPE

import numpy as np

from gensim import utils, corpora, matutils
from gensim.utils import check_output

logger = logging.getLogger(__name__)


class DtmModel(utils.SaveLoad):
    """Python wrapper using `DTM implementation `_.

    Communication between DTM and Python takes place by passing around data files
    on disk and executing the DTM binary as a subprocess.

    Warnings
    --------
    This is **only** python wrapper for `DTM implementation `_,
    you need to install original implementation first and pass the path to binary to ``dtm_path``.

    """
    def __init__(self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100,
                 id2word=None, prefix=None, lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10,
                 alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True):
        """

        Parameters
        ----------
        dtm_path : str
            Path to the dtm binary, e.g. `/home/username/dtm/dtm/main`.
        corpus : iterable of iterable of (int, int)
            Collection of texts in BoW format.
        time_slices : list of int
            Sequence of timestamps.
        mode : {'fit', 'time'}, optional
            Controls the mode of operation: 'fit' is for training, 'time' for
            analyzing documents through time according to a DTM, basically a held out set.
        model : {'fixed', 'dtm'}, optional
            Control model that will be run: 'fixed' is for DIM and 'dtm' for DTM.
        num_topics : int, optional
            Number of topics.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping between tokens ids and words from corpus, if not specified - will be inferred from `corpus`.
        prefix : str, optional
            Prefix for produced temporary files.
        lda_sequence_min_iter : int, optional
            Min iteration of LDA.
        lda_sequence_max_iter : int, optional
            Max iteration of LDA.
        lda_max_em_iter : int, optional
            Max em optimization iterations in LDA.
        alpha : int, optional
            Hyperparameter that affects sparsity of the document-topics for the LDA models in each timeslice.
        top_chain_var : int, optional
            Hyperparameter that affects the topic chain variance (smoothness of topic
            evolution across time slices).
        rng_seed : int, optional
            Random seed.
        initialize_lda : bool, optional
            If True - initialize DTM with LDA.

        """
        if not os.path.isfile(dtm_path):
            raise ValueError("dtm_path must point to the binary file, not to a folder")

        self.dtm_path = dtm_path
        self.id2word = id2word
        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
        if self.num_terms == 0:
            raise ValueError("cannot compute DTM over an empty collection (no terms)")
        self.num_topics = num_topics

        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            # Materialize the stream: counting with a plain generator would consume it,
            # leaving nothing for the emptiness check below and for train().
            corpus = list(corpus)
            lencorpus = len(corpus)
        if lencorpus == 0:
            raise ValueError("cannot compute DTM over an empty corpus")
        if model == "fixed" and any(not text for text in corpus):
            raise ValueError("""There is a text without words in the input corpus.
                    This breaks method='fixed' (The DIM model).""")
        if lencorpus != sum(time_slices):
            raise ValueError(
                "mismatched timeslices {slices} for corpus of len {clen}"
                .format(slices=sum(time_slices), clen=lencorpus)
            )
        self.lencorpus = lencorpus
        if prefix is None:
            rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
            prefix = os.path.join(tempfile.gettempdir(), rand_prefix)

        self.prefix = prefix
        self.time_slices = time_slices
        self.lda_sequence_min_iter = int(lda_sequence_min_iter)
        self.lda_sequence_max_iter = int(lda_sequence_max_iter)
        self.lda_max_em_iter = int(lda_max_em_iter)
        self.alpha = alpha
        self.top_chain_var = top_chain_var
        self.rng_seed = rng_seed
        # Stored as the lowercase string the DTM CLI expects ('true'/'false');
        # note a non-empty string is always truthy, so never test it with bare `if`.
        self.initialize_lda = str(initialize_lda).lower()

        self.lambda_ = None
        self.obs_ = None
        self.lhood_ = None
        self.gamma_ = None
        self.init_alpha = None
        self.init_beta = None
        self.init_ss = None
        self.em_steps = []
        self.influences_time = []

        if corpus is not None:
            self.train(corpus, time_slices, mode, model)

    def fout_liklihoods(self):
        """Get path to temporary lhood data file.

        Returns
        -------
        str
            Path to lhood data file.

        """
        return self.prefix + 'train_out/lda-seq/' + 'lhoods.dat'

    def fout_gamma(self):
        """Get path to temporary gamma data file.

        Returns
        -------
        str
            Path to gamma data file.

        """
        return self.prefix + 'train_out/lda-seq/' + 'gam.dat'

    def fout_prob(self):
        """Get template of path to temporary file.

        Returns
        -------
        str
            Path to file.

        """
        return self.prefix + 'train_out/lda-seq/' + 'topic-{i}-var-e-log-prob.dat'

    def fout_observations(self):
        """Get template of path to temporary file.

        Returns
        -------
        str
            Path to file.

        """
        return self.prefix + 'train_out/lda-seq/' + 'topic-{i}-var-obs.dat'

    def fout_influence(self):
        """Get template of path to temporary file.

        Returns
        -------
        str
            Path to file.

        """
        return self.prefix + 'train_out/lda-seq/' + 'influence_time-{i}'

    def foutname(self):
        """Get path to temporary file.

        Returns
        -------
        str
            Path to file.

        """
        return self.prefix + 'train_out'

    def fem_steps(self):
        """Get path to temporary em_step data file.

        Returns
        -------
        str
            Path to em_step data file.

        """
        return self.prefix + 'train_out/' + 'em_log.dat'

    def finit_alpha(self):
        """Get path to initially trained lda alpha file.

        Returns
        -------
        str
            Path to initially trained lda alpha file.

        """
        return self.prefix + 'train_out/' + 'initial-lda.alpha'

    def finit_beta(self):
        """Get path to initially trained lda beta file.

        Returns
        -------
        str
            Path to initially trained lda beta file.

        """
        return self.prefix + 'train_out/' + 'initial-lda.beta'

    def flda_ss(self):
        """Get path to initial lda binary file.

        Returns
        -------
        str
            Path to initial lda binary file.

        """
        return self.prefix + 'train_out/' + 'initial-lda-ss.dat'

    def fcorpustxt(self):
        """Get path to temporary file.

        Returns
        -------
        str
            Path to multiple train binary file.

        """
        return self.prefix + 'train-mult.dat'

    def fcorpus(self):
        """Get path to corpus file.

        Returns
        -------
        str
            Path to corpus file.

        """
        return self.prefix + 'train'

    def ftimeslices(self):
        """Get path to time slices binary file.

        Returns
        -------
        str
            Path to time slices binary file.

        """
        return self.prefix + 'train-seq.dat'

    def convert_input(self, corpus, time_slices):
        """Convert corpus into LDA-C format by :class:`~gensim.corpora.bleicorpus.BleiCorpus` and save to temp file.
        Path to temporary file produced by :meth:`~gensim.models.wrappers.dtmmodel.DtmModel.ftimeslices`.

        Parameters
        ----------
        corpus : iterable of iterable of (int, float)
            Corpus in BoW format.
        time_slices : list of int
            Sequence of timestamps.

        """
        logger.info("serializing temporary corpus to %s", self.fcorpustxt())
        # write out the corpus in a file format that DTM understands:
        corpora.BleiCorpus.save_corpus(self.fcorpustxt(), corpus)

        with utils.smart_open(self.ftimeslices(), 'wb') as fout:
            fout.write(utils.to_utf8(str(len(self.time_slices)) + "\n"))
            for sl in time_slices:
                fout.write(utils.to_utf8(str(sl) + "\n"))

    def train(self, corpus, time_slices, mode, model):
        """Train DTM model.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
            Collection of texts in BoW format.
        time_slices : list of int
            Sequence of timestamps.
        mode : {'fit', 'time'}, optional
            Controls the mode of operation: 'fit' is for training, 'time' for
            analyzing documents through time according to a DTM, basically a held out set.
        model : {'fixed', 'dtm'}, optional
            Control model that will be run: 'fixed' is for DIM and 'dtm' for DTM.

        """
        self.convert_input(corpus, time_slices)

        arguments = \
            "--ntopics={p0} --model={mofrl} --mode={p1} --initialize_lda={p2} --corpus_prefix={p3} " \
            "--outname={p4} --alpha={p5}".format(
                p0=self.num_topics, mofrl=model, p1=mode, p2=self.initialize_lda,
                p3=self.fcorpus(), p4=self.foutname(), p5=self.alpha
            )

        params = \
            "--lda_max_em_iter={p0} --lda_sequence_min_iter={p1} --lda_sequence_max_iter={p2} " \
            "--top_chain_var={p3} --rng_seed={p4} ".format(
                p0=self.lda_max_em_iter, p1=self.lda_sequence_min_iter, p2=self.lda_sequence_max_iter,
                p3=self.top_chain_var, p4=self.rng_seed
            )

        arguments = arguments + " " + params
        logger.info("training DTM with args %s", arguments)

        cmd = [self.dtm_path] + arguments.split()
        logger.info("Running command %s", cmd)
        check_output(args=cmd, stderr=PIPE)

        self.em_steps = np.loadtxt(self.fem_steps())
        self.init_ss = np.loadtxt(self.flda_ss())

        # `self.initialize_lda` is a string flag ('true'/'false'); a bare truthiness
        # test would always pass, so compare explicitly — the initial-lda files only
        # exist when the binary was actually asked to initialize with LDA.
        if self.initialize_lda == 'true':
            self.init_alpha = np.loadtxt(self.finit_alpha())
            self.init_beta = np.loadtxt(self.finit_beta())

        self.lhood_ = np.loadtxt(self.fout_liklihoods())

        # document-topic proportions
        self.gamma_ = np.loadtxt(self.fout_gamma())
        # cast to correct shape, gamma[5,10] is the proportion of the 10th topic
        # in doc 5
        self.gamma_.shape = (self.lencorpus, self.num_topics)
        # normalize proportions
        self.gamma_ /= self.gamma_.sum(axis=1)[:, np.newaxis]

        self.lambda_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))
        self.obs_ = np.zeros((self.num_topics, self.num_terms * len(self.time_slices)))

        for t in range(self.num_topics):
            topic = "%03d" % t
            self.lambda_[t, :] = np.loadtxt(self.fout_prob().format(i=topic))
            self.obs_[t, :] = np.loadtxt(self.fout_observations().format(i=topic))
        # cast to correct shape, lambda[5,10,0] is the proportion of the 10th
        # topic in doc 5 at time 0
        self.lambda_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
        self.obs_.shape = (self.num_topics, self.num_terms, len(self.time_slices))
        # extract document influence on topics for each time slice
        # influences_time[0] , influences at time 0
        if model == 'fixed':
            for k, t in enumerate(self.time_slices):
                stamp = "%03d" % k
                influence = np.loadtxt(self.fout_influence().format(i=stamp))
                influence.shape = (t, self.num_topics)
                # influence[2,5] influence of document 2 on topic 5
                self.influences_time.append(influence)

    def print_topics(self, num_topics=10, times=5, num_words=10):
        """Alias for :meth:`~gensim.models.wrappers.dtmmodel.DtmModel.show_topics`.

        Parameters
        ----------
        num_topics : int, optional
            Number of topics to return, set `-1` to get all topics.
        times : int, optional
            Number of times.
        num_words : int, optional
            Number of words.

        Returns
        -------
        list of str
            Topics as a list of strings

        """
        return self.show_topics(num_topics, times, num_words, log=True)

    def show_topics(self, num_topics=10, times=5, num_words=10, log=False, formatted=True):
        """Get the `num_words` most probable words for `num_topics` number of topics at 'times' time slices.

        Parameters
        ----------
        num_topics : int, optional
            Number of topics to return, set `-1` to get all topics.
        times : int, optional
            Number of times.
        num_words : int, optional
            Number of words.
        log : bool, optional
            THIS PARAMETER WILL BE IGNORED.
        formatted : bool, optional
            If `True` - return the topics as a list of strings, otherwise as lists of (weight, word) pairs.

        Returns
        -------
        list of str
            Topics as a list of strings (if formatted=True) **OR**
        list of (float, str)
            Topics as list of (weight, word) pairs (if formatted=False)

        """
        if num_topics < 0 or num_topics >= self.num_topics:
            num_topics = self.num_topics
            chosen_topics = range(num_topics)
        else:
            num_topics = min(num_topics, self.num_topics)
            chosen_topics = range(num_topics)

        if times < 0 or times >= len(self.time_slices):
            times = len(self.time_slices)
            chosen_times = range(times)
        else:
            times = min(times, len(self.time_slices))
            chosen_times = range(times)

        shown = []
        for time in chosen_times:
            for i in chosen_topics:
                if formatted:
                    topic = self.print_topic(i, time, num_words=num_words)
                else:
                    topic = self.show_topic(i, time, num_words=num_words)
                shown.append(topic)
        return shown

    def show_topic(self, topicid, time, topn=50, num_words=None):
        """Get `num_words` most probable words for the given `topicid`.

        Parameters
        ----------
        topicid : int
            Id of topic.
        time : int
            Timestamp.
        topn : int, optional
            Top number of topics that you'll receive.
        num_words : int, optional
            DEPRECATED PARAMETER, use `topn` instead.

        Returns
        -------
        list of (float, str)
            Sequence of probable words, as a list of `(word_probability, word)`.

        """
        if num_words is not None:  # deprecated num_words is used
            warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.")
            topn = num_words

        topics = self.lambda_[:, :, time]
        topic = topics[topicid]
        # likelihood to probability
        topic = np.exp(topic)
        # normalize to probability dist
        topic = topic / topic.sum()
        # sort according to prob
        bestn = matutils.argsort(topic, topn, reverse=True)
        beststr = [(topic[idx], self.id2word[idx]) for idx in bestn]
        return beststr

    def print_topic(self, topicid, time, topn=10, num_words=None):
        """Get the given topic, formatted as a string.

        Parameters
        ----------
        topicid : int
            Id of topic.
        time : int
            Timestamp.
        topn : int, optional
            Top number of topics that you'll receive.
        num_words : int, optional
            DEPRECATED PARAMETER, use `topn` instead.

        Returns
        -------
        str
            The given topic in string format, like '0.132*someword + 0.412*otherword + ...'.

        """
        if num_words is not None:  # deprecated num_words is used
            warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.")
            topn = num_words

        return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, topn)])

    def dtm_vis(self, corpus, time):
        """Get data specified by pyLDAvis format.

        Parameters
        ----------
        corpus : iterable of iterable of (int, float)
            Collection of texts in BoW format.
        time : int
            Sequence of timestamp.

        Notes
        -----
        All of these are needed to visualise topics for DTM for a particular time-slice via pyLDAvis.

        Returns
        -------
        doc_topic : numpy.ndarray
            Document-topic proportions.
        topic_term : numpy.ndarray
            Calculated term of topic suitable for pyLDAvis format.
        doc_lengths : list of int
            Length of each documents in corpus.
        term_frequency : numpy.ndarray
            Frequency of each word from vocab.
        vocab : list of str
            List of words from corpus.

        """
        # exponentiate once and reuse, instead of computing np.exp twice
        exp_lambda = np.exp(self.lambda_[:, :, time])
        topic_term = exp_lambda / exp_lambda.sum()
        topic_term *= self.num_topics

        doc_topic = self.gamma_

        doc_lengths = [len(doc) for doc in corpus]
        term_frequency = np.zeros(len(self.id2word))
        for doc_no, doc in enumerate(corpus):
            for pair in doc:
                term_frequency[pair[0]] += pair[1]

        vocab = [self.id2word[i] for i in range(0, len(self.id2word))]

        # returns numpy arrays for doc_topic proportions, topic_term proportions, and document_lengths, term_frequency.
        # these should be passed to the `pyLDAvis.prepare` method to visualise one time-slice of DTM topics.
        return doc_topic, topic_term, doc_lengths, term_frequency, vocab

    def dtm_coherence(self, time, num_words=20):
        """Get all topics of a particular time-slice without probability values for it to be used.
        For either "u_mass" or "c_v" coherence.

        Parameters
        ----------
        num_words : int
            Number of words.
        time : int
            Timestamp

        Returns
        -------
        coherence_topics : list of list of str
            All topics of a particular time-slice without probability values for it to be used.

        Warnings
        --------
        TODO: because of print format right now can only return for 1st time-slice, should we fix the coherence
        printing or make changes to the print statements to mirror DTM python?

        """
        coherence_topics = []
        for topic_no in range(0, self.num_topics):
            # pass `topn=` (not the deprecated `num_words=`) to avoid emitting a
            # spurious DeprecationWarning from our own internals
            topic = self.show_topic(topicid=topic_no, time=time, topn=num_words)
            coherence_topics.append([word for prob, word in topic])

        return coherence_topics