#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015 Dave Challis <dave@suicas.net>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Python wrapper for `Vowpal Wabbit's Latent Dirichlet Allocation <https://github.com/JohnLangford/vowpal_wabbit/>`_.

This uses `Matt Hoffman's online algorithm
<http://papers.nips.cc/paper/3902-online-learning-for-latent-dirichlet-allocation.pdf>`_, i.e. the same algorithm
that Gensim's :class:`~gensim.models.ldamodel.LdaModel` is based on.

Installation
------------
Use the `official guide <https://github.com/JohnLangford/vowpal_wabbit>`_ or this one ::

    git clone https://github.com/JohnLangford/vowpal_wabbit.git
    cd vowpal_wabbit
    make
    make test
    sudo make install

Warnings
--------
Currently working and tested with Vowpal Wabbit versions 7.10 to 8.1.1. Vowpal Wabbit's API isn't currently stable,
so this may or may not work with older/newer versions. The aim will be to ensure this wrapper always works with
the latest release of Vowpal Wabbit.


Examples
--------

Train model

>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.models.wrappers import LdaVowpalWabbit
>>>
>>> path_to_vw_binary = "/path/to/vw/binary"
>>> model = LdaVowpalWabbit(path_to_vw_binary, corpus=common_corpus, num_topics=20, id2word=common_dictionary)

Update existing model

>>> another_corpus = [[(1, 1), (2, 1)], [(3, 5)]]
>>> model.update(another_corpus)

Get topic probability distributions for a document

>>> document_bow = [(1, 1)]
>>> print(model[document_bow])

Print topics

>>> print(model.print_topics())

Save/load the trained model

>>> from gensim.test.utils import get_tmpfile
>>>
>>> temp_path = get_tmpfile("vw_lda.model")
>>> model.save(temp_path)
>>>
>>> loaded_lda = LdaVowpalWabbit.load(temp_path)

Calculate log-perplexity on given corpus

>>> another_corpus = [[(1, 1), (2, 1)], [(3, 5)]]
>>> print(model.log_perplexity(another_corpus))

Vowpal Wabbit works on files, so this wrapper maintains a temporary directory while it's around,
reading/writing there as necessary.

"""
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import logging
import os
import shutil
import subprocess
import tempfile

import numpy

from gensim import utils, matutils
from gensim.models.ldamodel import LdaModel

logger = logging.getLogger(__name__)


class LdaVowpalWabbit(utils.SaveLoad):
    """Python wrapper using `Vowpal Wabbit's online LDA <https://github.com/JohnLangford/vowpal_wabbit/>`_.

    Communication between Vowpal Wabbit and Python takes place by passing around data files
    on disk and calling the 'vw' binary with the subprocess module.

    Warnings
    --------
    This is **only** a Python wrapper for `Vowpal Wabbit's online LDA
    <https://github.com/JohnLangford/vowpal_wabbit/>`_, you need to install the original
    implementation first and pass the path to the binary via ``vw_path``.

    """
    def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None,
                 chunksize=256, passes=1, alpha=0.1, eta=0.1, decay=0.5,
                 offset=1, gamma_threshold=0.001, random_seed=None,
                 cleanup_files=True, tmp_prefix='tmp'):
"""
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
vw_path : str
|
||
|
Path to Vowpal Wabbit's binary.
|
||
|
corpus : iterable of list of (int, int), optional
|
||
|
Collection of texts in BoW format. If given, training will start immediately,
|
||
|
otherwise, you should call :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.train` or
|
||
|
:meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.update` manually for training.
|
||
|
num_topics : int, optional
|
||
|
Number of requested latent topics to be extracted from the training corpus.
|
||
|
Corresponds to VW's ``--lda <num_topics>`` argument.
|
||
|
id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
|
||
|
Mapping from word ids (integers) to words (strings).
|
||
|
chunksize : int, optional
|
||
|
Number of documents examined in each batch.
|
||
|
Corresponds to VW's ``--minibatch <batch_size>`` argument.
|
||
|
passes : int, optional
|
||
|
Number of passes over the dataset to use.
|
||
|
Corresponds to VW's ``--passes <passes>`` argument.
|
||
|
alpha : float, optional
|
||
|
Float effecting sparsity of per-document topic weights.
|
||
|
This is applied symmetrically, and should be set higher to when documents are thought to look more similar.
|
||
|
Corresponds to VW's ``--lda_alpha <alpha>`` argument.
|
||
|
eta : float, optional
|
||
|
Affects the sparsity of topic distributions.
|
||
|
This is applied symmetrically, and should be set higher when topics
|
||
|
are thought to look more similar.
|
||
|
Corresponds to VW's ``--lda_rho <rho>`` argument.
|
||
|
decay : float, optional
|
||
|
Learning rate decay, affects how quickly learnt values are forgotten.
|
||
|
Should be set to a value between 0.5 and 1.0 to guarantee convergence.
|
||
|
Corresponds to VW's ``--power_t <tau>`` argument.
|
||
|
offset: int, optional
|
||
|
Learning offset, set to higher values to slow down learning on early iterations of the algorithm.
|
||
|
Corresponds to VW's ``--initial_t <tau>`` argument.
|
||
|
gamma_threshold : float, optional
|
||
|
Affects when learning loop will be broken out of, higher values will result in earlier loop completion.
|
||
|
Corresponds to VW's ``--epsilon <eps>`` argument.
|
||
|
random_seed : int, optional
|
||
|
Sets random seed when learning.
|
||
|
Corresponds to VW's ``--random_seed <seed>`` argument.
|
||
|
cleanup_files : bool, optional
|
||
|
Whether or not to delete temporary directory and files used by this wrapper.
|
||
|
Setting to False can be useful for debugging, or for re-using Vowpal Wabbit files elsewhere.
|
||
|
tmp_prefix : str, optional
|
||
|
To prefix temporary working directory name.
|
||
|
|
||
|
"""
|
||
|
# default parameters are taken from Vowpal Wabbit's defaults, and
|
||
|
# parameter names changed to match Gensim's LdaModel where possible
|
||
|
self.vw_path = vw_path
|
||
|
self.id2word = id2word
|
||
|
|
||
|
if self.id2word is None:
|
||
|
if corpus is None:
|
||
|
raise ValueError(
|
||
|
"at least one of corpus/id2word must be specified, to establish input space dimensionality"
|
||
|
)
|
||
|
logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
|
||
|
self.id2word = utils.dict_from_corpus(corpus)
|
||
|
self.num_terms = len(self.id2word)
|
||
|
elif len(self.id2word) > 0:
|
||
|
self.num_terms = 1 + max(self.id2word.keys())
|
||
|
else:
|
||
|
self.num_terms = 0
|
||
|
|
||
|
if self.num_terms == 0:
|
||
|
raise ValueError("cannot compute LDA over an empty collection (no terms)")
|
||
|
|
||
|
# LDA parameters
|
||
|
self.num_topics = num_topics
|
||
|
self.chunksize = chunksize
|
||
|
self.passes = passes
|
||
|
self.alpha = alpha
|
||
|
self.eta = eta
|
||
|
self.gamma_threshold = gamma_threshold
|
||
|
self.offset = offset
|
||
|
self.decay = decay
|
||
|
self.random_seed = random_seed
|
||
|
self._initial_offset = offset
|
||
|
|
||
|
# temporary files used for Vowpal Wabbit input/output
|
||
|
self.tmp_dir = None
|
||
|
self.tmp_prefix = tmp_prefix
|
||
|
self.cleanup_files = cleanup_files
|
||
|
self._init_temp_dir(tmp_prefix)
|
||
|
|
||
|
# used for saving/loading this model's state
|
||
|
self._model_data = None
|
||
|
self._topics_data = None
|
||
|
|
||
|
# cache loaded topics as numpy array
|
||
|
self._topics = None
|
||
|
|
||
|
if corpus is not None:
|
||
|
self.train(corpus)
|
||
|
|
||
|
    def train(self, corpus):
        """Clear any existing model state, and train on given `corpus`.

        Parameters
        ----------
        corpus : iterable of list of (int, int)
            Collection of texts in BoW format.

        """
        logger.debug('Training new model from corpus')

        # reset any existing offset, model, or topics generated
        self.offset = self._initial_offset
        self._topics = None

        corpus_size = write_corpus_as_vw(corpus, self._corpus_filename)

        cmd = self._get_vw_train_command(corpus_size)

        _run_vw_command(cmd)

        # ensure that future updates of this model use correct offset
        self.offset += corpus_size

    def update(self, corpus):
        """Update existing model with `corpus`.

        Parameters
        ----------
        corpus : iterable of list of (int, int)
            Collection of texts in BoW format.

        """
        if not os.path.exists(self._model_filename):
            return self.train(corpus)

        logger.debug('Updating existing model from corpus')

        # reset any existing topics generated
        self._topics = None

        corpus_size = write_corpus_as_vw(corpus, self._corpus_filename)

        cmd = self._get_vw_update_command(corpus_size)

        _run_vw_command(cmd)

        # ensure that future updates of this model use correct offset
        self.offset += corpus_size

    def log_perplexity(self, chunk):
        """Get per-word lower bound on log perplexity.

        Parameters
        ----------
        chunk : iterable of list of (int, int)
            Collection of texts in BoW format.

        Returns
        -------
        bound : float
            Per-word lower bound on log perplexity.

        """
        vw_data = self._predict(chunk)[1]
        corpus_words = sum(cnt for document in chunk for _, cnt in document)
        bound = -vw_data['average_loss']
        logger.info(
            "%.3f per-word bound, %.1f perplexity estimate based on a held-out corpus of %i documents with %i words",
            bound, numpy.exp2(-bound), vw_data['corpus_size'], corpus_words
        )
        return bound

    def get_topics(self):
        """Get topics X words matrix.

        Returns
        -------
        numpy.ndarray
            `num_topics` x `vocabulary_size` array of floats which represents the learned term topic matrix.

        """
        topics = self._get_topics()
        return topics / topics.sum(axis=1)[:, None]

    def print_topics(self, num_topics=10, num_words=10):
        """Alias for :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.show_topics`.

        Parameters
        ----------
        num_topics : int, optional
            Number of topics to return, set `-1` to get all topics.
        num_words : int, optional
            Number of words per topic.

        Returns
        -------
        list of str
            Topics as a list of strings.

        """
        return self.show_topics(num_topics, num_words, log=True)

    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
        """Get the `num_words` most probable words for `num_topics` number of topics.

        Parameters
        ----------
        num_topics : int, optional
            Number of topics to return, set `-1` to get all topics.
        num_words : int, optional
            Number of words per topic.
        log : bool, optional
            If True, also log the topics via the logger.
        formatted : bool, optional
            If `True`, return the topics as a list of strings, otherwise as lists of (weight, word) pairs.

        Returns
        -------
        list of str
            Topics as a list of strings (if formatted=True) **OR**
        list of (float, str)
            Topics as list of (weight, word) pairs (if formatted=False).

        """
        if num_topics < 0 or num_topics >= self.num_topics:
            num_topics = self.num_topics
        else:
            num_topics = min(num_topics, self.num_topics)

        chosen_topics = range(num_topics)
        shown = []

        for i in chosen_topics:
            if formatted:
                topic = self.print_topic(i, topn=num_words)
            else:
                topic = self.show_topic(i, topn=num_words)

            shown.append(topic)

            if log:
                logger.info("topic #%i (%.3f): %s", i, self.alpha, topic)

        return shown

    def print_topic(self, topicid, topn=10):
        """Get text representation of topic.

        Parameters
        ----------
        topicid : int
            Id of topic.
        topn : int, optional
            Top number of words in topic.

        Returns
        -------
        str
            Topic `topicid` in text representation.
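
        Examples
        --------
        A sketch of the output format produced by the join below; the words and
        weights here are hypothetical:

        >>> model.print_topic(0, topn=3)
        '0.096*apple + 0.084*banana + 0.081*cherry'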

        """
        return ' + '.join(['{0:.3f}*{1}'.format(v[0], v[1]) for v in self.show_topic(topicid, topn)])

    def show_topic(self, topicid, topn=10):
        """Get `topn` most probable words for the given `topicid`.

        Parameters
        ----------
        topicid : int
            Id of topic.
        topn : int, optional
            Number of the most probable words to return for the topic.

        Returns
        -------
        list of (float, str)
            Sequence of probable words, as a list of `(word_probability, word)` for `topicid` topic.

        """
        topics = self._get_topics()
        topic = topics[topicid]
        bestn = matutils.argsort(topic, topn, reverse=True)
        return [(topic[t_id], self.id2word[t_id]) for t_id in bestn]

    def save(self, fname, *args, **kwargs):
        """Save model to file.

        Parameters
        ----------
        fname : str
            Path to output file.

        """
        if os.path.exists(self._model_filename):
            # Vowpal Wabbit uses its own binary model file, read this into
            # variable before serialising this object - keeps all data
            # self contained within a single serialised file
            logger.debug("Reading model bytes from '%s'", self._model_filename)
            with utils.smart_open(self._model_filename, 'rb') as fhandle:
                self._model_data = fhandle.read()

        if os.path.exists(self._topics_filename):
            logger.debug("Reading topic bytes from '%s'", self._topics_filename)
            with utils.smart_open(self._topics_filename, 'rb') as fhandle:
                self._topics_data = fhandle.read()

        if 'ignore' not in kwargs:
            kwargs['ignore'] = frozenset(['_topics', 'tmp_dir'])

        super(LdaVowpalWabbit, self).save(fname, *args, **kwargs)

    @classmethod
    def load(cls, fname, *args, **kwargs):
        """Load model from `fname`.

        Parameters
        ----------
        fname : str
            Path to file with :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`.

        """
        lda_vw = super(LdaVowpalWabbit, cls).load(fname, *args, **kwargs)
        lda_vw._init_temp_dir(prefix=lda_vw.tmp_prefix)

        if lda_vw._model_data:
            # Vowpal Wabbit operates on its own binary model file - deserialise
            # to file at load time, making it immediately ready for use
            logger.debug("Writing model bytes to '%s'", lda_vw._model_filename)
            with utils.smart_open(lda_vw._model_filename, 'wb') as fhandle:
                fhandle.write(lda_vw._model_data)
            lda_vw._model_data = None  # no need to keep in memory after this

        if lda_vw._topics_data:
            logger.debug("Writing topic bytes to '%s'", lda_vw._topics_filename)
            with utils.smart_open(lda_vw._topics_filename, 'wb') as fhandle:
                fhandle.write(lda_vw._topics_data)
            lda_vw._topics_data = None

        return lda_vw

    def __del__(self):
        """Cleanup the temporary directory used by this wrapper."""
        if self.cleanup_files and self.tmp_dir:
            logger.debug("Recursively deleting: %s", self.tmp_dir)
            shutil.rmtree(self.tmp_dir)

    def _init_temp_dir(self, prefix='tmp'):
        """Create a working temporary directory with given prefix.

        Parameters
        ----------
        prefix : str
            Prefix of the temporary directory.

        """
        self.tmp_dir = tempfile.mkdtemp(prefix=prefix)
        logger.info('using %s as temp dir', self.tmp_dir)

    def _get_vw_predict_command(self, corpus_size):
        """Get list of command line arguments for running prediction.

        Parameters
        ----------
        corpus_size : int
            Size of the corpus.

        Returns
        -------
        list of str
            Command line arguments for the 'vw' binary.

        """
        cmd = [
            self.vw_path,
            '--testonly',  # don't update model with this data
            '--lda_D', str(corpus_size),
            '-i', self._model_filename,  # load existing binary model
            '-d', self._corpus_filename,
            '--learning_rate', '0',  # possibly not needed, but harmless
            '-p', self._predict_filename
        ]

        if self.random_seed is not None:
            cmd.extend(['--random_seed', str(self.random_seed)])

        return cmd

    def _get_vw_train_command(self, corpus_size, update=False):
        """Get list of command line arguments for running model training.

        Parameters
        ----------
        corpus_size : int
            Size of corpus.
        update : bool
            Set `True` to further train an existing model.

        Returns
        -------
        list of str
            Sequence of all training parameters.
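
        Examples
        --------
        A rough sketch of the command this assembles for a fresh model, using
        the wrapper's defaults; file paths (here abbreviated) point into the
        temporary directory, and ``<corpus_size>``/``<bits>`` are computed at
        call time ::

            vw -d corpus.vw --power_t 0.5 --initial_t 1 --minibatch 256
            --lda_D <corpus_size> --passes 1 --cache_file cache.vw
            --lda_epsilon 0.001 --readable_model topics.vw -k -f model.vw
            --lda 100 -b <bits> --lda_alpha 0.1 --lda_rho 0.1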

        """
        cmd = [
            self.vw_path,
            '-d', self._corpus_filename,
            '--power_t', str(self.decay),
            '--initial_t', str(self.offset),
            '--minibatch', str(self.chunksize),
            '--lda_D', str(corpus_size),
            '--passes', str(self.passes),
            '--cache_file', self._cache_filename,
            '--lda_epsilon', str(self.gamma_threshold),
            '--readable_model', self._topics_filename,
            '-k',  # clear cache
            '-f', self._model_filename
        ]

        if update:
            cmd.extend(['-i', self._model_filename])
        else:
            # these params are read from model file if updating
            cmd.extend([
                '--lda', str(self.num_topics),
                '-b', str(_bit_length(self.num_terms)),
                '--lda_alpha', str(self.alpha),
                '--lda_rho', str(self.eta)
            ])

        if self.random_seed is not None:
            cmd.extend(['--random_seed', str(self.random_seed)])

        return cmd

    def _get_vw_update_command(self, corpus_size):
        """Get list of command line arguments to update a model.
        Alias for :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit._get_vw_train_command`
        with ``update=True``.

        Parameters
        ----------
        corpus_size : int
            Size of the corpus.

        Returns
        -------
        list of str
            Sequence of all training parameters.

        """
        return self._get_vw_train_command(corpus_size, update=True)

    def _load_vw_topics(self):
        """Read topics file generated by Vowpal Wabbit, convert to numpy array."""
        topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)

        with utils.smart_open(self._topics_filename) as topics_file:
            found_data = False

            for line in topics_file:
                # look for start of data
                if not found_data:
                    if line.startswith(b'0 ') and b':' not in line:
                        found_data = True
                    else:
                        continue

                fields = line.split()
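                # each data line is the word id followed by one weight per topic,
                # e.g. "5 1.1 0.82 2.9" for a 3-topic model (hypothetical values)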
                word_id = int(fields[0])

                # output contains entries for 2**b terms, where b was set
                # by the '-b' option, ignore anything past num_terms
                if word_id >= self.num_terms:
                    break

                topics[:, word_id] = fields[1:]

        # normalise to probability distribution
        self._topics = topics / topics.sum(axis=1, keepdims=True)

    def _get_topics(self):
        """Get topics matrix, load from file if necessary."""
        if self._topics is None:
            self._load_vw_topics()
        return self._topics

    def _predict(self, chunk):
        """Run given chunk of documents against currently trained model.

        Parameters
        ----------
        chunk : iterable of list of (int, int)
            Sequence of documents in BoW format.

        Returns
        -------
        predictions : ndarray
            Matrix of prediction probabilities, one row per document.
        vw_data : dict
            Vowpal Wabbit data.

        """
        corpus_size = write_corpus_as_vw(chunk, self._corpus_filename)

        cmd = self._get_vw_predict_command(corpus_size)
        vw_data = _parse_vw_output(_run_vw_command(cmd))
        vw_data['corpus_size'] = corpus_size

        predictions = numpy.zeros((corpus_size, self.num_topics), dtype=numpy.float32)

        with utils.smart_open(self._predict_filename) as fhandle:
            for i, line in enumerate(fhandle):
                predictions[i, :] = line.split()

        predictions = predictions / predictions.sum(axis=1, keepdims=True)

        return predictions, vw_data

    def __getitem__(self, bow, eps=0.01):
        """Convert document or corpus in BoW format to LDA topic distribution(s).

        Parameters
        ----------
        bow : {list of (int, int), iterable of list of (int, int)}
            Document or corpus in BoW format.
        eps : float
            Threshold value (all topics with probability < `eps` will be ignored).

        Returns
        -------
        list of (int, float)
            LDA vector for document **OR**
        list of list of (int, float)
            LDA vectors for corpus.

        """
        is_corpus, dummy_corpus = utils.is_corpus(bow)
        if not is_corpus:
            bow = [bow]

        predictions = self._predict(bow)[0]

        topics = []
        for row in predictions:
            row_topics = []
            for topic_id, val in enumerate(row):
                if val > eps:
                    row_topics.append((topic_id, val))
            topics.append(row_topics)

        return topics if is_corpus else topics[0]

    def _get_filename(self, name):
        """Get path to given filename in temp directory.

        Parameters
        ----------
        name : str
            Name of the file.

        Returns
        -------
        str
            Path to a file.

        """
        return os.path.join(self.tmp_dir, name)

    @property
    def _model_filename(self):
        """Get path to file to write Vowpal Wabbit model to.

        Returns
        -------
        str
            Path to file to write Vowpal Wabbit model to.

        """
        return self._get_filename('model.vw')

    @property
    def _cache_filename(self):
        """Get path to file to write Vowpal Wabbit cache to.

        Returns
        -------
        str
            Path to file to write Vowpal Wabbit cache to.

        """
        return self._get_filename('cache.vw')

    @property
    def _corpus_filename(self):
        """Get path to file to write Vowpal Wabbit corpus to.

        Returns
        -------
        str
            Path to file to write Vowpal Wabbit corpus to.

        """
        return self._get_filename('corpus.vw')

    @property
    def _topics_filename(self):
        """Get path to file to write Vowpal Wabbit topics to.

        Returns
        -------
        str
            Path to file to write Vowpal Wabbit topics to.

        """
        return self._get_filename('topics.vw')

    @property
    def _predict_filename(self):
        """Get path to file to write Vowpal Wabbit predictions to.

        Returns
        -------
        str
            Path to file to write Vowpal Wabbit predictions to.

        """
        return self._get_filename('predict.vw')

    def __str__(self):
        """Get text representation of model."""
        fields = ['num_terms', 'num_topics', 'chunksize', 'alpha', 'eta']
        kv = ["{0}={1}".format(field, getattr(self, field)) for field in fields]
        return "{0}({1})".format(self.__class__.__name__, ', '.join(kv))


def corpus_to_vw(corpus):
    """Convert corpus to Vowpal Wabbit format.

    Parameters
    ----------
    corpus : iterable of list of (int, int)
        Collection of texts in BoW format.


    Notes
    -----

    Vowpal Wabbit format ::

        | 4:7 14:1 22:8 6:3
        | 14:22 22:4 0:1 1:3
        | 7:2 8:2


    Yields
    ------
    str
        Corpus in Vowpal Wabbit format, line by line.
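
    Examples
    --------
    A minimal sketch of the conversion (each BoW pair becomes an `id:count` token):

    >>> list(corpus_to_vw([[(0, 2), (4, 1)], [(2, 3)]]))
    ['| 0:2 4:1', '| 2:3']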

    """
    for entries in corpus:
        line = ['|']
        for word_id, count in entries:
            line.append("{0}:{1}".format(word_id, count))
        yield ' '.join(line)


def write_corpus_as_vw(corpus, filename):
    """Convert `corpus` to Vowpal Wabbit format and save it to `filename`.

    Parameters
    ----------
    corpus : iterable of list of (int, int)
        Collection of texts in BoW format.
    filename : str
        Path to output file.

    Returns
    -------
    int
        Number of lines in `filename`.
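
    Examples
    --------
    A small sketch of writing two documents to disk; the return value is simply
    the number of documents written:

    >>> from gensim.test.utils import get_tmpfile
    >>> corpus_file = get_tmpfile("corpus.vw")
    >>> write_corpus_as_vw([[(0, 2), (4, 1)], [(2, 3)]], corpus_file)
    2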

    """
    logger.debug("Writing corpus to: %s", filename)

    corpus_size = 0
    with utils.smart_open(filename, 'wb') as corpus_file:
        for line in corpus_to_vw(corpus):
            corpus_file.write(line.encode('utf-8') + b'\n')
            corpus_size += 1

    return corpus_size


def _parse_vw_output(text):
    """Get dict of useful fields from Vowpal Wabbit's output.

    Parameters
    ----------
    text : str
        Output of the 'vw' binary (stdout/stderr as captured by :func:`_run_vw_command`).

    Returns
    -------
    dict of (str, float)
        Dictionary with field "average_loss", lower bound on mean per-word log-perplexity.
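
    Examples
    --------
    A minimal sketch; only the "average loss" line of VW's report is picked up:

    >>> _parse_vw_output("average loss = 5.613")
    {'average_loss': 5.613}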

    """
    data = {}
    for line in text.splitlines():
        if line.startswith('average loss'):
            data['average_loss'] = float(line.split('=')[1])
            break

    return data


def _run_vw_command(cmd):
    """Execute given Vowpal Wabbit command, log stdout and stderr.

    Parameters
    ----------
    cmd : list of str
        Given Vowpal Wabbit command to execute, as a list of arguments.

    Returns
    -------
    str
        Stdout and stderr.

    Raises
    ------
    subprocess.CalledProcessError
        If something goes wrong.

    """
    logger.info("Running Vowpal Wabbit command: %s", ' '.join(cmd))
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    output = proc.communicate()[0].decode('utf-8')
    logger.debug("Vowpal Wabbit output: %s", output)

    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, ' '.join(cmd), output=output)

    return output


# if python2.6 support is ever dropped, can change to using int.bit_length()
def _bit_length(num):
    """Get number of bits needed to encode given number.

    Parameters
    ----------
    num : int
        Number to encode.

    Returns
    -------
    int
        Number of bits needed to encode given number.
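
    Examples
    --------
    A quick sanity check (5 is ``101`` in binary):

    >>> _bit_length(5)
    3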

    """
    return len(bin(num).lstrip('-0b'))


def vwmodel2ldamodel(vw_model, iterations=50):
    """Convert :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit` to
    :class:`~gensim.models.ldamodel.LdaModel`.

    This works by simply copying the training model weights (alpha, beta...) from a trained vwmodel
    into the gensim model.

    Parameters
    ----------
    vw_model : :class:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`
        Trained Vowpal Wabbit model.
    iterations : int
        Number of iterations to be used for inference of the new :class:`~gensim.models.ldamodel.LdaModel`.

    Returns
    -------
    :class:`~gensim.models.ldamodel.LdaModel`
        Gensim native LDA.
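
    Examples
    --------
    A hypothetical conversion, assuming `vw_model` was trained as in the module
    examples above; the result supports Gensim's native LDA API:

    >>> lda_gensim = vwmodel2ldamodel(vw_model)
    >>> print(lda_gensim.show_topics())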

    """
    model_gensim = LdaModel(
        num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize,
        passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay,
        offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold,
        dtype=numpy.float32
    )
    model_gensim.expElogbeta[:] = vw_model._get_topics()
    return model_gensim