#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015 Dave Challis
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Python wrapper for `Vowpal Wabbit's Latent Dirichlet Allocation
<https://github.com/JohnLangford/vowpal_wabbit>`_.

This uses Matt Hoffman's online algorithm, i.e. the same algorithm that Gensim's
:class:`~gensim.models.ldamodel.LdaModel` is based on.

Installation
------------
Use the official guide or this one ::

    git clone https://github.com/JohnLangford/vowpal_wabbit.git
    cd vowpal_wabbit
    make
    make test
    sudo make install

Warnings
--------
Currently working and tested with Vowpal Wabbit versions 7.10 to 8.1.1. Vowpal Wabbit's API isn't currently stable,
so this may or may not work with older/newer versions. The aim will be to ensure this wrapper always works with
the latest release of Vowpal Wabbit.

Examples
--------

Train model

>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.models.wrappers import LdaVowpalWabbit
>>>
>>> path_to_wv_binary = "/path/to/vw/binary"
>>> model = LdaVowpalWabbit(path_to_wv_binary, corpus=common_corpus, num_topics=20, id2word=common_dictionary)

Update existing model

>>> another_corpus = [[(1, 1), (2, 1)], [(3, 5)]]
>>> model.update(another_corpus)

Get topic probability distributions for a document

>>> document_bow = [(1, 1)]
>>> print(model[document_bow])

Print topics

>>> print(model.print_topics())

Save/load the trained model

>>> from gensim.test.utils import get_tmpfile
>>>
>>> temp_path = get_tmpfile("vw_lda.model")
>>> model.save(temp_path)
>>>
>>> loaded_lda = LdaVowpalWabbit.load(temp_path)

Calculate log-perplexity on a given corpus

>>> another_corpus = [[(1, 1), (2, 1)], [(3, 5)]]
>>> print(model.log_perplexity(another_corpus))

Vowpal Wabbit works on files, so this wrapper maintains a temporary directory while it's around,
reading/writing there as necessary.

"""
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import os
import shutil
import subprocess
import tempfile

import numpy

from gensim import utils, matutils
from gensim.models.ldamodel import LdaModel

logger = logging.getLogger(__name__)


class LdaVowpalWabbit(utils.SaveLoad):
    """Python wrapper using `Vowpal Wabbit's online LDA
    <https://github.com/JohnLangford/vowpal_wabbit>`_.

    Communication between Vowpal Wabbit and Python takes place by passing around data files
    on disk and calling the 'vw' binary with the subprocess module.

    Warnings
    --------
    This is **only** a Python wrapper for `Vowpal Wabbit's online LDA
    <https://github.com/JohnLangford/vowpal_wabbit>`_:
    you need to install the original implementation first and pass the path to the binary via ``vw_path``.

    """
    def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None, chunksize=256, passes=1, alpha=0.1,
                 eta=0.1, decay=0.5, offset=1, gamma_threshold=0.001, random_seed=None, cleanup_files=True,
                 tmp_prefix='tmp'):
        """

        Parameters
        ----------
        vw_path : str
            Path to Vowpal Wabbit's binary.
        corpus : iterable of list of (int, int), optional
            Collection of texts in BoW format. If given, training will start immediately,
            otherwise, you should call :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.train` or
            :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.update` manually for training.
        num_topics : int, optional
            Number of requested latent topics to be extracted from the training corpus.
            Corresponds to VW's ``--lda`` argument.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping from word ids (integers) to words (strings).
        chunksize : int, optional
            Number of documents examined in each batch.
            Corresponds to VW's ``--minibatch`` argument.
        passes : int, optional
            Number of passes over the dataset to use.
            Corresponds to VW's ``--passes`` argument.
        alpha : float, optional
            Float affecting the sparsity of per-document topic weights.
            This is applied symmetrically, and should be set higher when documents are thought to look more similar.
            Corresponds to VW's ``--lda_alpha`` argument.
        eta : float, optional
            Affects the sparsity of topic distributions.
            This is applied symmetrically, and should be set higher when topics are thought to look more similar.
            Corresponds to VW's ``--lda_rho`` argument.
        decay : float, optional
            Learning rate decay, affects how quickly learnt values are forgotten.
            Should be set to a value between 0.5 and 1.0 to guarantee convergence.
            Corresponds to VW's ``--power_t`` argument.
        offset : int, optional
            Learning offset, set to higher values to slow down learning on early iterations of the algorithm.
            Corresponds to VW's ``--initial_t`` argument.
        gamma_threshold : float, optional
            Affects when the learning loop will be broken out of, higher values will result in earlier loop completion.
            Corresponds to VW's ``--lda_epsilon`` argument.
        random_seed : int, optional
            Sets random seed when learning.
            Corresponds to VW's ``--random_seed`` argument.
        cleanup_files : bool, optional
            Whether or not to delete the temporary directory and files used by this wrapper.
            Setting to False can be useful for debugging, or for re-using Vowpal Wabbit files elsewhere.
        tmp_prefix : str, optional
            Prefix for the temporary working directory name.

        """
        # default parameters are taken from Vowpal Wabbit's defaults, and
        # parameter names changed to match Gensim's LdaModel where possible
        self.vw_path = vw_path
        self.id2word = id2word

        if self.id2word is None:
            if corpus is None:
                raise ValueError(
                    "at least one of corpus/id2word must be specified, to establish input space dimensionality"
                )
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")

        # LDA parameters
        self.num_topics = num_topics
        self.chunksize = chunksize
        self.passes = passes
        self.alpha = alpha
        self.eta = eta
        self.gamma_threshold = gamma_threshold
        self.offset = offset
        self.decay = decay
        self.random_seed = random_seed
        self._initial_offset = offset

        # temporary files used for Vowpal Wabbit input/output
        self.tmp_dir = None
        self.tmp_prefix = tmp_prefix
        self.cleanup_files = cleanup_files
        self._init_temp_dir(tmp_prefix)

        # used for saving/loading this model's state
        self._model_data = None
        self._topics_data = None

        # cache loaded topics as numpy array
        self._topics = None

        if corpus is not None:
            self.train(corpus)

    def train(self, corpus):
        """Clear any existing model state, and train on given `corpus`.

        Parameters
        ----------
        corpus : iterable of list of (int, int)
            Collection of texts in BoW format.

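        Examples
        --------
        A minimal sketch, assuming a local ``vw`` binary (the path below is a placeholder):

        >>> from gensim.test.utils import common_corpus, common_dictionary
        >>> from gensim.models.wrappers import LdaVowpalWabbit
        >>>
        >>> model = LdaVowpalWabbit("/path/to/vw/binary", num_topics=2, id2word=common_dictionary)
        >>> model.train(common_corpus)
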
""" logger.debug('Training new model from corpus') # reset any existing offset, model, or topics generated self.offset = self._initial_offset self._topics = None corpus_size = write_corpus_as_vw(corpus, self._corpus_filename) cmd = self._get_vw_train_command(corpus_size) _run_vw_command(cmd) # ensure that future updates of this model use correct offset self.offset += corpus_size def update(self, corpus): """Update existing model with `corpus`. Parameters ---------- corpus : iterable of list of (int, int) Collection of texts in BoW format. """ if not os.path.exists(self._model_filename): return self.train(corpus) logger.debug('Updating exiting model from corpus') # reset any existing topics generated self._topics = None corpus_size = write_corpus_as_vw(corpus, self._corpus_filename) cmd = self._get_vw_update_command(corpus_size) _run_vw_command(cmd) # ensure that future updates of this model use correct offset self.offset += corpus_size def log_perplexity(self, chunk): """Get per-word lower bound on log perplexity. Parameters ---------- chunk : iterable of list of (int, int) Collection of texts in BoW format. Returns ------- bound : float Per-word lower bound on log perplexity. """ vw_data = self._predict(chunk)[1] corpus_words = sum(cnt for document in chunk for _, cnt in document) bound = -vw_data['average_loss'] logger.info( "%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words", bound, numpy.exp2(-bound), vw_data['corpus_size'], corpus_words ) return bound def get_topics(self): """Get topics X words matrix. Returns ------- numpy.ndarray `num_topics` x `vocabulary_size` array of floats which represents the learned term topic matrix. """ topics = self._get_topics() return topics / topics.sum(axis=1)[:, None] def print_topics(self, num_topics=10, num_words=10): """Alias for :meth:`~gensim.models.wrappers.dtmmodel.DtmModel.show_topics`. Parameters ---------- num_topics : int, optional Number of topics to return, set `-1` to get all topics. num_words : int, optional Number of words. Returns ------- list of str Topics as a list of strings """ return self.show_topics(num_topics, num_words, log=True) def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): """Get the `num_words` most probable words for `num_topics` number of topics. Parameters ---------- num_topics : int, optional Number of topics to return, set `-1` to get all topics. num_words : int, optional Number of words. log : bool, optional If True - will write topics with logger. formatted : bool, optional If `True` - return the topics as a list of strings, otherwise as lists of (weight, word) pairs. Returns ------- list of str Topics as a list of strings (if formatted=True) **OR** list of (float, str) Topics as list of (weight, word) pairs (if formatted=False) """ if num_topics < 0 or num_topics >= self.num_topics: num_topics = self.num_topics else: num_topics = min(num_topics, self.num_topics) chosen_topics = range(num_topics) shown = [] for i in chosen_topics: if formatted: topic = self.print_topic(i, topn=num_words) else: topic = self.show_topic(i, topn=num_words) shown.append(topic) if log: logger.info("topic #%i (%.3f): %s", i, self.alpha, topic) return shown def print_topic(self, topicid, topn=10): """Get text representation of topic. Parameters ---------- topicid : int Id of topic. topn : int, optional Top number of words in topic. Returns ------- str Topic `topicid` in text representation. 
""" return ' + '.join(['{0:.3f}*{1}'.format(v[0], v[1]) for v in self.show_topic(topicid, topn)]) def show_topic(self, topicid, topn=10): """Get `num_words` most probable words for the given `topicid`. Parameters ---------- topicid : int Id of topic. topn : int, optional Top number of topics that you'll receive. Returns ------- list of (str, float) Sequence of probable words, as a list of `(word, word_probability)` for `topicid` topic. """ topics = self._get_topics() topic = topics[topicid] bestn = matutils.argsort(topic, topn, reverse=True) return [(topic[t_id], self.id2word[t_id]) for t_id in bestn] def save(self, fname, *args, **kwargs): """Save model to file. Parameters ---------- fname : str Path to output file. """ if os.path.exists(self._model_filename): # Vowpal Wabbit uses its own binary model file, read this into # variable before serialising this object - keeps all data # self contained within a single serialised file logger.debug("Reading model bytes from '%s'", self._model_filename) with utils.smart_open(self._model_filename, 'rb') as fhandle: self._model_data = fhandle.read() if os.path.exists(self._topics_filename): logger.debug("Reading topic bytes from '%s'", self._topics_filename) with utils.smart_open(self._topics_filename, 'rb') as fhandle: self._topics_data = fhandle.read() if 'ignore' not in kwargs: kwargs['ignore'] = frozenset(['_topics', 'tmp_dir']) super(LdaVowpalWabbit, self).save(fname, *args, **kwargs) @classmethod def load(cls, fname, *args, **kwargs): """Load model from `fname`. Parameters ---------- fname : str Path to file with :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`. """ lda_vw = super(LdaVowpalWabbit, cls).load(fname, *args, **kwargs) lda_vw._init_temp_dir(prefix=lda_vw.tmp_prefix) if lda_vw._model_data: # Vowpal Wabbit operates on its own binary model file - deserialise # to file at load time, making it immediately ready for use logger.debug("Writing model bytes to '%s'", lda_vw._model_filename) with utils.smart_open(lda_vw._model_filename, 'wb') as fhandle: fhandle.write(lda_vw._model_data) lda_vw._model_data = None # no need to keep in memory after this if lda_vw._topics_data: logger.debug("Writing topic bytes to '%s'", lda_vw._topics_filename) with utils.smart_open(lda_vw._topics_filename, 'wb') as fhandle: fhandle.write(lda_vw._topics_data) lda_vw._topics_data = None return lda_vw def __del__(self): """Cleanup the temporary directory used by this wrapper.""" if self.cleanup_files and self.tmp_dir: logger.debug("Recursively deleting: %s", self.tmp_dir) shutil.rmtree(self.tmp_dir) def _init_temp_dir(self, prefix='tmp'): """Create a working temporary directory with given prefix. Parameters ---------- prefix : str Prefix of the temporary directory. """ self.tmp_dir = tempfile.mkdtemp(prefix=prefix) logger.info('using %s as temp dir', self.tmp_dir) def _get_vw_predict_command(self, corpus_size): """Get list of command line arguments for running prediction. Parameters ---------- corpus_size : int Size of the corpus. """ cmd = [ self.vw_path, '--testonly', # don't update model with this data '--lda_D', str(corpus_size), '-i', self._model_filename, # load existing binary model '-d', self._corpus_filename, '--learning_rate', '0', # possibly not needed, but harmless '-p', self._predict_filename ] if self.random_seed is not None: cmd.extend(['--random_seed', str(self.random_seed)]) return cmd def _get_vw_train_command(self, corpus_size, update=False): """Get list of command line arguments for running model training. 

        Parameters
        ----------
        corpus_size : int
            Size of corpus.
        update : bool
            Set `True` to further train an existing model.

        Returns
        -------
        list of str
            Sequence of all training parameters.

        """
        cmd = [
            self.vw_path,
            '-d', self._corpus_filename,
            '--power_t', str(self.decay),
            '--initial_t', str(self.offset),
            '--minibatch', str(self.chunksize),
            '--lda_D', str(corpus_size),
            '--passes', str(self.passes),
            '--cache_file', self._cache_filename,
            '--lda_epsilon', str(self.gamma_threshold),
            '--readable_model', self._topics_filename,
            '-k',  # clear cache
            '-f', self._model_filename
        ]

        if update:
            cmd.extend(['-i', self._model_filename])
        else:
            # these params are read from model file if updating
            cmd.extend([
                '--lda', str(self.num_topics),
                '-b', str(_bit_length(self.num_terms)),
                '--lda_alpha', str(self.alpha),
                '--lda_rho', str(self.eta)
            ])

        if self.random_seed is not None:
            cmd.extend(['--random_seed', str(self.random_seed)])

        return cmd

    def _get_vw_update_command(self, corpus_size):
        """Get list of command line arguments to update a model.
        Alias for :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit._get_vw_train_command`
        with ``update=True``.

        Parameters
        ----------
        corpus_size : int
            Size of the corpus.

        Returns
        -------
        list of str
            Sequence of all training parameters.

        """
        return self._get_vw_train_command(corpus_size, update=True)

    def _load_vw_topics(self):
        """Read topics file generated by Vowpal Wabbit, convert to numpy array."""
        topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)

        with utils.smart_open(self._topics_filename) as topics_file:
            found_data = False

            for line in topics_file:
                # look for start of data
                if not found_data:
                    if line.startswith(b'0 ') and b':' not in line:
                        found_data = True
                    else:
                        continue

                fields = line.split()
                word_id = int(fields[0])

                # output contains entries for 2**b terms, where b was set
                # by the '-b' option, ignore anything past num_terms
                if word_id >= self.num_terms:
                    break

                topics[:, word_id] = fields[1:]

        # normalise to probability distribution
        self._topics = topics / topics.sum(axis=1, keepdims=True)

    def _get_topics(self):
        """Get topics matrix, load from file if necessary."""
        if self._topics is None:
            self._load_vw_topics()
        return self._topics

    def _predict(self, chunk):
        """Run given chunk of documents against currently trained model.

        Parameters
        ----------
        chunk : iterable of list of (int, int)
            Sequence of documents in BoW format.

        Returns
        -------
        predictions : numpy.ndarray
            Prediction matrix, one row per document in `chunk`.
        vw_data : dict
            Parsed Vowpal Wabbit output data.

        """
        corpus_size = write_corpus_as_vw(chunk, self._corpus_filename)

        cmd = self._get_vw_predict_command(corpus_size)
        vw_data = _parse_vw_output(_run_vw_command(cmd))
        vw_data['corpus_size'] = corpus_size

        predictions = numpy.zeros((corpus_size, self.num_topics), dtype=numpy.float32)

        with utils.smart_open(self._predict_filename) as fhandle:
            for i, line in enumerate(fhandle):
                predictions[i, :] = line.split()

        predictions = predictions / predictions.sum(axis=1, keepdims=True)

        return predictions, vw_data

    def __getitem__(self, bow, eps=0.01):
        """Convert a document or corpus in BoW format to LDA vectors (topic probability distributions).

        Parameters
        ----------
        bow : {list of (int, int), iterable of list of (int, int)}
            Document or corpus in BoW format.
        eps : float
            Threshold value (all topics with probability < `eps` will be ignored).

        Returns
        -------
        list of (int, float)
            LDA vector for document **OR**
        list of list of (int, float)
            LDA vectors for corpus.

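        Examples
        --------
        A sketch, assuming `model` is an already trained wrapper instance:

        >>> document_bow = [(1, 1)]
        >>> print(model[document_bow])
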
""" is_corpus, dummy_corpus = utils.is_corpus(bow) if not is_corpus: bow = [bow] predictions = self._predict(bow)[0] topics = [] for row in predictions: row_topics = [] for topic_id, val in enumerate(row): if val > eps: row_topics.append((topic_id, val)) topics.append(row_topics) return topics if is_corpus else topics[0] def _get_filename(self, name): """Get path to given filename in temp directory. Parameters ---------- name : str Name of the file. Returns ------- str Path to a file. """ return os.path.join(self.tmp_dir, name) @property def _model_filename(self): """Get path to file to write Vowpal Wabbit model to. Returns ------- str Path to file to write Vowpal Wabbit model to. """ return self._get_filename('model.vw') @property def _cache_filename(self): """Get path to file to write Vowpal Wabbit cache to. Returns ------- str Path to file to write Vowpal Wabbit cache to. """ return self._get_filename('cache.vw') @property def _corpus_filename(self): """Get path to file to write Vowpal Wabbit corpus to. Returns ------- str Path to file to write Vowpal Wabbit corpus to. """ return self._get_filename('corpus.vw') @property def _topics_filename(self): """Get path to file to write Vowpal Wabbit topics to. Returns ------- str Path to file to write Vowpal Wabbit topics to. """ return self._get_filename('topics.vw') @property def _predict_filename(self): """Get path to file to write Vowpal Wabbit predictions to. Returns ------- str Path to file to write Vowpal Wabbit predictions to. """ return self._get_filename('predict.vw') def __str__(self): """Get text representation of model.""" fields = ['num_terms', 'num_topics', 'chunksize', 'alpha', 'eta'] kv = ["{0}={1}".format(field, getattr(self, field)) for field in fields] return "{0}({1})".format(self.__class__.__name__, ', '.join(kv)) def corpus_to_vw(corpus): """Convert corpus to Vowpal Wabbit format. Parameters ---------- corpus : iterable of list of (int, int) Collection of texts in BoW format. Notes ----- Vowpal Wabbit format :: | 4:7 14:1 22:8 6:3 | 14:22 22:4 0:1 1:3 | 7:2 8:2 Yields ------ str Corpus in Vowpal Wabbit, line by line. """ for entries in corpus: line = ['|'] for word_id, count in entries: line.append("{0}:{1}".format(word_id, count)) yield ' '.join(line) def write_corpus_as_vw(corpus, filename): """Covert `corpus` to Vowpal Wabbit format and save it to `filename`. Parameters ---------- corpus : iterable of list of (int, int) Collection of texts in BoW format. filename : str Path to output file. Returns ------- int Number of lines in `filename`. """ logger.debug("Writing corpus to: %s", filename) corpus_size = 0 with utils.smart_open(filename, 'wb') as corpus_file: for line in corpus_to_vw(corpus): corpus_file.write(line.encode('utf-8') + b'\n') corpus_size += 1 return corpus_size def _parse_vw_output(text): """Get dict of useful fields from Vowpal Wabbit's output. Parameters ---------- text : str Text from vw file. Returns ------- dict of (str, float) Dictionary with field "average_loss", lower bound on mean per-word log-perplexity. """ data = {} for line in text.splitlines(): if line.startswith('average loss'): data['average_loss'] = float(line.split('=')[1]) break return data def _run_vw_command(cmd): """Execute given Vowpal Wabbit command, log stdout and stderr. Parameters ---------- cmd : str Given Vowpal Wabbit command to execute. Returns ------- str Stdout and stderr. Raises ------ subprocess.CalledProcessError If something goes wrong. 
""" logger.info("Running Vowpal Wabbit command: %s", ' '.join(cmd)) proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) output = proc.communicate()[0].decode('utf-8') logger.debug("Vowpal Wabbit output: %s", output) if proc.returncode != 0: raise subprocess.CalledProcessError(proc.returncode, ' '.join(cmd), output=output) return output # if python2.6 support is ever dropped, can change to using int.bit_length() def _bit_length(num): """Get number of bits needed to encode given number. Parameters ---------- num : int Number to encode. Returns ------- int Number of bits needed to encode given number. """ return len(bin(num).lstrip('-0b')) def vwmodel2ldamodel(vw_model, iterations=50): """Convert :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit` to :class:`~gensim.models.ldamodel.LdaModel`. This works by simply copying the training model weights (alpha, beta...) from a trained vwmodel into the gensim model. Parameters ---------- vw_model : :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit` Trained Vowpal Wabbit model. iterations : int Number of iterations to be used for inference of the new :class:`~gensim.models.ldamodel.LdaModel`. Returns ------- :class:`~gensim.models.ldamodel.LdaModel`. Gensim native LDA. """ model_gensim = LdaModel( num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize, passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay, offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold, dtype=numpy.float32 ) model_gensim.expElogbeta[:] = vw_model._get_topics() return model_gensim