# Copyright (C) 2017 Parul Sethi # Copyright (C) 2017 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """Python wrapper around `Wordrank `_. Original paper: `"WordRank: Learning Word Embeddings via Robust Ranking " `_. Installation ------------ Use `official guide `_ or this one * On Linux :: sudo yum install boost-devel #(on RedHat/Centos) sudo apt-get install libboost-all-dev #(on Ubuntu) git clone https://bitbucket.org/shihaoji/wordrank cd wordrank/ # replace icc to gcc in install.sh ./install.sh * On MacOS :: brew install cmake brew install wget brew install boost brew install mercurial git clone https://bitbucket.org/shihaoji/wordrank cd wordrank/ # replace icc to gcc in install.sh ./install.sh Examples -------- >>> from gensim.models.wrappers import Wordrank >>> >>> path_to_wordrank_binary = '/path/to/wordrank/binary' >>> model = Wordrank.train(path_to_wordrank_binary, corpus_file='text8', out_name='wr_model') >>> >>> print model["hello"] # prints vector for given words Warnings -------- Note that the wrapper might not work in a docker container for large datasets due to memory limits (caused by MPI). """ from __future__ import division import logging import os import copy import multiprocessing from gensim import utils from gensim.models.keyedvectors import KeyedVectors from gensim.scripts.glove2word2vec import glove2word2vec from smart_open import smart_open from shutil import copyfile, rmtree logger = logging.getLogger(__name__) class Wordrank(KeyedVectors): """Python wrapper using `Wordrank implementation `_ Communication between Wordrank and Python takes place by working with data files on disk and calling the Wordrank binary and glove's helper binaries (for preparing training data) with subprocess module. Warnings -------- This is **only** python wrapper for `Wordrank implementation `_, you need to install original implementation first and pass the path to wordrank dir to ``wr_path``. """ @classmethod def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0, sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100, beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0): """Train model. Parameters ---------- wr_path : str Absolute path to the Wordrank directory. corpus_file : str Path to corpus file, expected space-separated tokens in a each line format. out_name : str Name of the directory which will be created (in wordrank folder) to save embeddings and training data: * ``model_word_current_.txt`` - Word Embeddings saved after every dump_period. * ``model_context_current_.txt`` - Context Embeddings saved after every dump_period. * ``meta/vocab.txt`` - vocab file. * ``meta/wiki.toy`` - word-word concurrence values. size : int, optional Dimensionality of the feature vectors. window : int, optional Number of context words to the left (and to the right, if `symmetric = 1`). symmetric : {0, 1}, optional If 1 - using symmetric windows, if 0 - will use only left context words. min_count : int, optional Ignore all words with total frequency lower than `min_count`. max_vocab_size : int, optional Upper bound on vocabulary size, i.e. keep the most frequent words. If 0 - no limit. sgd_num : int, optional Number of SGD taken for each data point. lrate : float, optional Learning rate (attention: too high diverges, give Nan). period : int, optional Period of xi variable updates. iter : int, optional Number of iterations (epochs) over the corpus. epsilon : float, optional Power scaling value for weighting function. dump_period : int, optional Period after which embeddings should be dumped. reg : int, optional Value of regularization parameter. alpha : int, optional Alpha parameter of gamma distribution. beta : int, optional Beta parameter of gamma distribution. loss : {"logistic", "hinge"}, optional Name of the loss function. memory : float, optional Soft limit for memory consumption, in GB. np : int, optional Number of process to execute (mpirun option). cleanup_files : bool, optional If True, delete directory and files used by this wrapper. sorted_vocab : {0, 1}, optional If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing. ensemble : {0, 1}, optional If 1 - use ensemble of word and context vectors. """ # prepare training data (cooccurrence matrix and vocab) model_dir = os.path.join(wr_path, out_name) meta_dir = os.path.join(model_dir, 'meta') os.makedirs(meta_dir) logger.info("Dumped data will be stored in '%s'", model_dir) copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1])) vocab_file = os.path.join(meta_dir, 'vocab.txt') temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt') cooccurrence_file = os.path.join(meta_dir, 'cooccurrence') cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy') meta_file = os.path.join(meta_dir, 'meta') cmd_vocab_count = [ os.path.join(wr_path, 'glove', 'vocab_count'), '-min-count', str(min_count), '-max-vocab', str(max_vocab_size) ] cmd_cooccurence_count = [ os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory), '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric) ] cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)] cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file] commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences] input_fnames = [ os.path.join(meta_dir, os.path.split(corpus_file)[-1]), os.path.join(meta_dir, os.path.split(corpus_file)[-1]), cooccurrence_file ] output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file] logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames)) for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames): with smart_open(input_fname, 'rb') as r: with smart_open(output_fname, 'wb') as w: utils.check_output(w, args=command, stdin=r) logger.info("Deleting frequencies from vocab file") with smart_open(vocab_file, 'wb') as w: utils.check_output(w, args=cmd_del_vocab_freq) with smart_open(vocab_file, 'rb') as f: numwords = sum(1 for _ in f) with smart_open(cooccurrence_shuf_file, 'rb') as f: numlines = sum(1 for _ in f) with smart_open(meta_file, 'wb') as f: meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format( numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1], numwords, vocab_file.split('/')[-1] ) f.write(meta_info.encode('utf-8')) if iter % dump_period == 0: iter += 1 else: logger.warning( "Resultant embedding will be from %d iterations rather than the input %d iterations, " "as wordrank dumps the embedding only at dump_period intervals. " "Input an appropriate combination of parameters (iter, dump_period) " "such that \"iter mod dump_period\" is zero.", iter - (iter % dump_period), iter ) wr_args = { 'path': meta_dir, 'nthread': multiprocessing.cpu_count(), 'sgd_num': sgd_num, 'lrate': lrate, 'period': period, 'iter': iter, 'epsilon': epsilon, 'dump_prefix': 'model', 'dump_period': dump_period, 'dim': size, 'reg': reg, 'alpha': alpha, 'beta': beta, 'loss': loss } # run wordrank executable with wr_args cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')] for option, value in wr_args.items(): cmd.append('--%s' % option) cmd.append(str(value)) logger.info("Running wordrank binary") utils.check_output(args=cmd) # use embeddings from max. iteration's dump max_iter_dump = iter - (iter % dump_period) os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words')) os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts')) model = cls.load_wordrank_model( os.path.join(model_dir, 'wordrank.words'), vocab_file, os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble ) if cleanup_files: rmtree(model_dir) return model @classmethod def load_wordrank_model(cls, model_file, vocab_file=None, context_file=None, sorted_vocab=1, ensemble=1): """Load model from `model_file`. Parameters ---------- model_file : str Path to model in GloVe format. vocab_file : str, optional Path to file with vocabulary. context_file : str, optional Path to file with context-embedding in word2vec_format. sorted_vocab : {0, 1}, optional If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing. ensemble : {0, 1}, optional If 1 - use ensemble of word and context vectors. """ glove2word2vec(model_file, model_file + '.w2vformat') model = cls.load_word2vec_format('%s.w2vformat' % model_file) if ensemble and context_file: model.ensemble_embedding(model_file, context_file) if sorted_vocab and vocab_file: model.sort_embeddings(vocab_file) return model def sort_embeddings(self, vocab_file): """Sort embeddings according to word frequency. Parameters ---------- vocab_file : str Path to file with vocabulary. """ counts = {} vocab_size = len(self.vocab) prev_syn0 = copy.deepcopy(self.syn0) prev_vocab = copy.deepcopy(self.vocab) self.index2word = [] # sort embeddings using frequency sorted vocab file in wordrank with utils.smart_open(vocab_file) as fin: for index, line in enumerate(fin): word, count = utils.to_unicode(line).strip(), vocab_size - index # store word with it's count in a dict counts[word] = int(count) # build new index2word with frequency sorted words self.index2word.append(word) assert len(self.index2word) == vocab_size, 'mismatch between vocab sizes' for word_id, word in enumerate(self.index2word): self.syn0[word_id] = prev_syn0[prev_vocab[word].index] self.vocab[word].index = word_id self.vocab[word].count = counts[word] def ensemble_embedding(self, word_embedding, context_embedding): """Replace current syn0 with the sum of context and word embeddings. Parameters ---------- word_embedding : str Path to word embeddings in GloVe format. context_embedding : str Path to context embeddings in word2vec_format. Returns ------- numpy.ndarray Matrix with new embeddings. """ glove2word2vec(context_embedding, context_embedding + '.w2vformat') w_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % word_embedding) c_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % context_embedding) # compare vocab words using keys of dict vocab assert set(w_emb.vocab) == set(c_emb.vocab), 'Vocabs are not same for both embeddings' # sort context embedding to have words in same order as word embedding prev_c_emb = copy.deepcopy(c_emb.syn0) for word_id, word in enumerate(w_emb.index2word): c_emb.syn0[word_id] = prev_c_emb[c_emb.vocab[word].index] # add vectors of the two embeddings new_emb = w_emb.syn0 + c_emb.syn0 self.syn0 = new_emb return new_emb