laywerrobot/lib/python3.6/site-packages/gensim/models/wrappers/wordrank.py

# Copyright (C) 2017 Parul Sethi <parul1sethi@gmail.com>
# Copyright (C) 2017 Radim Rehurek <me@radimrehurek.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Python wrapper around `Wordrank <https://bitbucket.org/shihaoji/wordrank/>`_.
Original paper: `"WordRank: Learning Word Embeddings via Robust Ranking " <https://arxiv.org/pdf/1506.02761v3.pdf>`_.

Installation
------------
Use `official guide <https://github.com/shihaoji/wordrank>`_ or this one

* On Linux ::

    sudo yum install boost-devel #(on RedHat/Centos)
    sudo apt-get install libboost-all-dev #(on Ubuntu)

    git clone https://bitbucket.org/shihaoji/wordrank
    cd wordrank/
    # replace icc to gcc in install.sh
    ./install.sh

* On MacOS ::

    brew install cmake
    brew install wget
    brew install boost
    brew install mercurial

    git clone https://bitbucket.org/shihaoji/wordrank
    cd wordrank/
    # replace icc to gcc in install.sh
    ./install.sh

Examples
--------

>>> from gensim.models.wrappers import Wordrank
>>>
>>> path_to_wordrank_binary = '/path/to/wordrank/binary'
>>> model = Wordrank.train(path_to_wordrank_binary, corpus_file='text8', out_name='wr_model')
>>>
>>> print model["hello"]  # prints vector for given words

Warnings
--------
Note that the wrapper might not work in a docker container for large datasets due to memory limits (caused by MPI).

"""

from __future__ import division

import logging
import os
import copy
import multiprocessing

from gensim import utils
from gensim.models.keyedvectors import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

from smart_open import smart_open
from shutil import copyfile, rmtree


logger = logging.getLogger(__name__)


class Wordrank(KeyedVectors):
    """Python wrapper using `Wordrank implementation <https://bitbucket.org/shihaoji/wordrank/>`_

    Communication between Wordrank and Python takes place by working with data
    files on disk and calling the Wordrank binary and glove's helper binaries
    (for preparing training data) with subprocess module.

    Warnings
    --------
    This is **only** python wrapper for `Wordrank implementation <https://bitbucket.org/shihaoji/wordrank/>`_,
    you need to install original implementation first and pass the path to wordrank dir to ``wr_path``.

    """
    @classmethod
    def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,
              sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,
              beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0):
        """Train model.

        Parameters
        ----------
        wr_path : str
            Absolute path to the Wordrank directory.
        corpus_file : str
            Path to corpus file, expected space-separated tokens in a each line format.
        out_name : str
            Name of the directory which will be created (in wordrank folder) to save embeddings and training data:
                * ``model_word_current_<iter>.txt`` - Word Embeddings saved after every dump_period.
                * ``model_context_current_<iter>.txt`` - Context Embeddings saved after every dump_period.
                * ``meta/vocab.txt`` - vocab file.
                * ``meta/wiki.toy`` - word-word concurrence values.
        size : int, optional
            Dimensionality of the feature vectors.
        window : int, optional
            Number of context words to the left (and to the right, if `symmetric = 1`).
        symmetric : {0, 1}, optional
            If 1 - using symmetric windows, if 0 - will use only left context words.
        min_count : int, optional
            Ignore all words with total frequency lower than `min_count`.
        max_vocab_size : int, optional
            Upper bound on vocabulary size, i.e. keep the <int> most frequent words. If 0 - no limit.
        sgd_num : int, optional
            Number of SGD taken for each data point.
        lrate : float, optional
            Learning rate (attention: too high diverges, give Nan).
        period : int, optional
            Period of xi variable updates.
        iter : int, optional
            Number of iterations (epochs) over the corpus.
        epsilon : float, optional
            Power scaling value for weighting function.
        dump_period : int, optional
            Period after which embeddings should be dumped.
        reg : int, optional
            Value of regularization parameter.
        alpha : int, optional
            Alpha parameter of gamma distribution.
        beta : int, optional
            Beta parameter of gamma distribution.
        loss : {"logistic", "hinge"}, optional
            Name of the loss function.
        memory : float, optional
            Soft limit for memory consumption, in GB.
        np : int, optional
            Number of process to execute (mpirun option).
        cleanup_files : bool, optional
            If True, delete directory and files used by this wrapper.
        sorted_vocab : {0, 1}, optional
            If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing.
        ensemble : {0, 1}, optional
            If 1 - use ensemble of word and context vectors.

        """

        # prepare training data (cooccurrence matrix and vocab)
        model_dir = os.path.join(wr_path, out_name)
        meta_dir = os.path.join(model_dir, 'meta')
        os.makedirs(meta_dir)
        logger.info("Dumped data will be stored in '%s'", model_dir)
        copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1]))

        vocab_file = os.path.join(meta_dir, 'vocab.txt')
        temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
        cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
        cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
        meta_file = os.path.join(meta_dir, 'meta')

        cmd_vocab_count = [
            os.path.join(wr_path, 'glove', 'vocab_count'),
            '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)
        ]
        cmd_cooccurence_count = [
            os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),
            '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)
        ]
        cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
        cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

        commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
        input_fnames = [
            os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
            os.path.join(meta_dir, os.path.split(corpus_file)[-1]),
            cooccurrence_file
        ]
        output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

        logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
        for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
            with smart_open(input_fname, 'rb') as r:
                with smart_open(output_fname, 'wb') as w:
                    utils.check_output(w, args=command, stdin=r)

        logger.info("Deleting frequencies from vocab file")
        with smart_open(vocab_file, 'wb') as w:
            utils.check_output(w, args=cmd_del_vocab_freq)

        with smart_open(vocab_file, 'rb') as f:
            numwords = sum(1 for _ in f)
        with smart_open(cooccurrence_shuf_file, 'rb') as f:
            numlines = sum(1 for _ in f)
        with smart_open(meta_file, 'wb') as f:
            meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
                numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1],
                numwords, vocab_file.split('/')[-1]
            )
            f.write(meta_info.encode('utf-8'))

        if iter % dump_period == 0:
            iter += 1
        else:
            logger.warning(
                "Resultant embedding will be from %d iterations rather than the input %d iterations, "
                "as wordrank dumps the embedding only at dump_period intervals. "
                "Input an appropriate combination of parameters (iter, dump_period) "
                "such that \"iter mod dump_period\" is zero.",
                iter - (iter % dump_period), iter
            )

        wr_args = {
            'path': meta_dir,
            'nthread': multiprocessing.cpu_count(),
            'sgd_num': sgd_num,
            'lrate': lrate,
            'period': period,
            'iter': iter,
            'epsilon': epsilon,
            'dump_prefix': 'model',
            'dump_period': dump_period,
            'dim': size,
            'reg': reg,
            'alpha': alpha,
            'beta': beta,
            'loss': loss
        }

        # run wordrank executable with wr_args
        cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
        for option, value in wr_args.items():
            cmd.append('--%s' % option)
            cmd.append(str(value))
        logger.info("Running wordrank binary")
        utils.check_output(args=cmd)

        # use embeddings from max. iteration's dump
        max_iter_dump = iter - (iter % dump_period)
        os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words'))
        os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts'))
        model = cls.load_wordrank_model(
            os.path.join(model_dir, 'wordrank.words'), vocab_file,
            os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble
        )

        if cleanup_files:
            rmtree(model_dir)
        return model

    @classmethod
    def load_wordrank_model(cls, model_file, vocab_file=None, context_file=None, sorted_vocab=1, ensemble=1):
        """Load model from `model_file`.

        Parameters
        ----------
        model_file : str
            Path to model in GloVe format.
        vocab_file : str, optional
            Path to file with vocabulary.
        context_file : str, optional
            Path to file with context-embedding in word2vec_format.
        sorted_vocab : {0, 1}, optional
            If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing.
        ensemble : {0, 1}, optional
            If 1 - use ensemble of word and context vectors.

        """
        glove2word2vec(model_file, model_file + '.w2vformat')
        model = cls.load_word2vec_format('%s.w2vformat' % model_file)
        if ensemble and context_file:
            model.ensemble_embedding(model_file, context_file)
        if sorted_vocab and vocab_file:
            model.sort_embeddings(vocab_file)
        return model

    def sort_embeddings(self, vocab_file):
        """Sort embeddings according to word frequency.

        Parameters
        ----------
        vocab_file : str
            Path to file with vocabulary.

        """
        counts = {}
        vocab_size = len(self.vocab)
        prev_syn0 = copy.deepcopy(self.syn0)
        prev_vocab = copy.deepcopy(self.vocab)
        self.index2word = []

        # sort embeddings using frequency sorted vocab file in wordrank
        with utils.smart_open(vocab_file) as fin:
            for index, line in enumerate(fin):
                word, count = utils.to_unicode(line).strip(), vocab_size - index
                # store word with it's count in a dict
                counts[word] = int(count)
                # build new index2word with frequency sorted words
                self.index2word.append(word)
        assert len(self.index2word) == vocab_size, 'mismatch between vocab sizes'

        for word_id, word in enumerate(self.index2word):
            self.syn0[word_id] = prev_syn0[prev_vocab[word].index]
            self.vocab[word].index = word_id
            self.vocab[word].count = counts[word]

    def ensemble_embedding(self, word_embedding, context_embedding):
        """Replace current syn0 with the sum of context and word embeddings.

        Parameters
        ----------
        word_embedding : str
            Path to word embeddings in GloVe format.
        context_embedding : str
            Path to context embeddings in word2vec_format.

        Returns
        -------
        numpy.ndarray
            Matrix with new embeddings.

        """
        glove2word2vec(context_embedding, context_embedding + '.w2vformat')
        w_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % word_embedding)
        c_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % context_embedding)
        # compare vocab words using keys of dict vocab
        assert set(w_emb.vocab) == set(c_emb.vocab), 'Vocabs are not same for both embeddings'

        # sort context embedding to have words in same order as word embedding
        prev_c_emb = copy.deepcopy(c_emb.syn0)
        for word_id, word in enumerate(w_emb.index2word):
            c_emb.syn0[word_id] = prev_c_emb[c_emb.vocab[word].index]
        # add vectors of the two embeddings
        new_emb = w_emb.syn0 + c_emb.syn0
        self.syn0 = new_emb
        return new_emb
first commit 2020-08-27 21:55:39 +02:00			`# Copyright (C) 2017 Parul Sethi <parul1sethi@gmail.com>`
			`# Copyright (C) 2017 Radim Rehurek <me@radimrehurek.com>`
			`# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html`

			"""Python wrapper around `Wordrank <https://bitbucket.org/shihaoji/wordrank/>`_.
			Original paper: `"WordRank: Learning Word Embeddings via Robust Ranking " <https://arxiv.org/pdf/1506.02761v3.pdf>`_.

			`Installation`
			`------------`
			Use `official guide <https://github.com/shihaoji/wordrank>`_ or this one

			`* On Linux ::`

			`sudo yum install boost-devel #(on RedHat/Centos)`
			`sudo apt-get install libboost-all-dev #(on Ubuntu)`

			`git clone https://bitbucket.org/shihaoji/wordrank`
			`cd wordrank/`
			`# replace icc to gcc in install.sh`
			`./install.sh`

			`* On MacOS ::`

			`brew install cmake`
			`brew install wget`
			`brew install boost`
			`brew install mercurial`

			`git clone https://bitbucket.org/shihaoji/wordrank`
			`cd wordrank/`
			`# replace icc to gcc in install.sh`
			`./install.sh`

			`Examples`
			`--------`

			`>>> from gensim.models.wrappers import Wordrank`
			`>>>`
			`>>> path_to_wordrank_binary = '/path/to/wordrank/binary'`
			`>>> model = Wordrank.train(path_to_wordrank_binary, corpus_file='text8', out_name='wr_model')`
			`>>>`
			`>>> print model["hello"] # prints vector for given words`

			`Warnings`
			`--------`
			`Note that the wrapper might not work in a docker container for large datasets due to memory limits (caused by MPI).`

			`"""`

			`from __future__ import division`

			`import logging`
			`import os`
			`import copy`
			`import multiprocessing`

			`from gensim import utils`
			`from gensim.models.keyedvectors import KeyedVectors`
			`from gensim.scripts.glove2word2vec import glove2word2vec`

			`from smart_open import smart_open`
			`from shutil import copyfile, rmtree`


			`logger = logging.getLogger(__name__)`


			`class Wordrank(KeyedVectors):`
			"""Python wrapper using `Wordrank implementation <https://bitbucket.org/shihaoji/wordrank/>`_

			`Communication between Wordrank and Python takes place by working with data`
			`files on disk and calling the Wordrank binary and glove's helper binaries`
			`(for preparing training data) with subprocess module.`

			`Warnings`
			`--------`
			This is only python wrapper for `Wordrank implementation <https://bitbucket.org/shihaoji/wordrank/>`_,
			you need to install original implementation first and pass the path to wordrank dir to ``wr_path``.

			`"""`
			`@classmethod`
			`def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5, max_vocab_size=0,`
			`sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75, dump_period=10, reg=0, alpha=100,`
			`beta=99, loss='hinge', memory=4.0, np=1, cleanup_files=False, sorted_vocab=1, ensemble=0):`
			`"""Train model.`

			`Parameters`
			`----------`
			`wr_path : str`
			`Absolute path to the Wordrank directory.`
			`corpus_file : str`
			`Path to corpus file, expected space-separated tokens in a each line format.`
			`out_name : str`
			`Name of the directory which will be created (in wordrank folder) to save embeddings and training data:`
			* ``model_word_current_<iter>.txt`` - Word Embeddings saved after every dump_period.
			* ``model_context_current_<iter>.txt`` - Context Embeddings saved after every dump_period.
			* ``meta/vocab.txt`` - vocab file.
			* ``meta/wiki.toy`` - word-word concurrence values.
			`size : int, optional`
			`Dimensionality of the feature vectors.`
			`window : int, optional`
			Number of context words to the left (and to the right, if `symmetric = 1`).
			`symmetric : {0, 1}, optional`
			`If 1 - using symmetric windows, if 0 - will use only left context words.`
			`min_count : int, optional`
			Ignore all words with total frequency lower than `min_count`.
			`max_vocab_size : int, optional`
			`Upper bound on vocabulary size, i.e. keep the <int> most frequent words. If 0 - no limit.`
			`sgd_num : int, optional`
			`Number of SGD taken for each data point.`
			`lrate : float, optional`
			`Learning rate (attention: too high diverges, give Nan).`
			`period : int, optional`
			`Period of xi variable updates.`
			`iter : int, optional`
			`Number of iterations (epochs) over the corpus.`
			`epsilon : float, optional`
			`Power scaling value for weighting function.`
			`dump_period : int, optional`
			`Period after which embeddings should be dumped.`
			`reg : int, optional`
			`Value of regularization parameter.`
			`alpha : int, optional`
			`Alpha parameter of gamma distribution.`
			`beta : int, optional`
			`Beta parameter of gamma distribution.`
			`loss : {"logistic", "hinge"}, optional`
			`Name of the loss function.`
			`memory : float, optional`
			`Soft limit for memory consumption, in GB.`
			`np : int, optional`
			`Number of process to execute (mpirun option).`
			`cleanup_files : bool, optional`
			`If True, delete directory and files used by this wrapper.`
			`sorted_vocab : {0, 1}, optional`
			`If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing.`
			`ensemble : {0, 1}, optional`
			`If 1 - use ensemble of word and context vectors.`

			`"""`

			`# prepare training data (cooccurrence matrix and vocab)`
			`model_dir = os.path.join(wr_path, out_name)`
			`meta_dir = os.path.join(model_dir, 'meta')`
			`os.makedirs(meta_dir)`
			`logger.info("Dumped data will be stored in '%s'", model_dir)`
			`copyfile(corpus_file, os.path.join(meta_dir, corpus_file.split('/')[-1]))`

			`vocab_file = os.path.join(meta_dir, 'vocab.txt')`
			`temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')`
			`cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')`
			`cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')`
			`meta_file = os.path.join(meta_dir, 'meta')`

			`cmd_vocab_count = [`
			`os.path.join(wr_path, 'glove', 'vocab_count'),`
			`'-min-count', str(min_count), '-max-vocab', str(max_vocab_size)`
			`]`
			`cmd_cooccurence_count = [`
			`os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),`
			`'-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)`
			`]`
			`cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]`
			`cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]`

			`commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]`
			`input_fnames = [`
			`os.path.join(meta_dir, os.path.split(corpus_file)[-1]),`
			`os.path.join(meta_dir, os.path.split(corpus_file)[-1]),`
			`cooccurrence_file`
			`]`
			`output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]`

			`logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))`
			`for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):`
			`with smart_open(input_fname, 'rb') as r:`
			`with smart_open(output_fname, 'wb') as w:`
			`utils.check_output(w, args=command, stdin=r)`

			`logger.info("Deleting frequencies from vocab file")`
			`with smart_open(vocab_file, 'wb') as w:`
			`utils.check_output(w, args=cmd_del_vocab_freq)`

			`with smart_open(vocab_file, 'rb') as f:`
			`numwords = sum(1 for _ in f)`
			`with smart_open(cooccurrence_shuf_file, 'rb') as f:`
			`numlines = sum(1 for _ in f)`
			`with smart_open(meta_file, 'wb') as f:`
			`meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(`
			`numwords, numwords, numlines, cooccurrence_shuf_file.split('/')[-1],`
			`numwords, vocab_file.split('/')[-1]`
			`)`
			`f.write(meta_info.encode('utf-8'))`

			`if iter % dump_period == 0:`
			`iter += 1`
			`else:`
			`logger.warning(`
			`"Resultant embedding will be from %d iterations rather than the input %d iterations, "`
			`"as wordrank dumps the embedding only at dump_period intervals. "`
			`"Input an appropriate combination of parameters (iter, dump_period) "`
			`"such that \"iter mod dump_period\" is zero.",`
			`iter - (iter % dump_period), iter`
			`)`

			`wr_args = {`
			`'path': meta_dir,`
			`'nthread': multiprocessing.cpu_count(),`
			`'sgd_num': sgd_num,`
			`'lrate': lrate,`
			`'period': period,`
			`'iter': iter,`
			`'epsilon': epsilon,`
			`'dump_prefix': 'model',`
			`'dump_period': dump_period,`
			`'dim': size,`
			`'reg': reg,`
			`'alpha': alpha,`
			`'beta': beta,`
			`'loss': loss`
			`}`

			`# run wordrank executable with wr_args`
			`cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]`
			`for option, value in wr_args.items():`
			`cmd.append('--%s' % option)`
			`cmd.append(str(value))`
			`logger.info("Running wordrank binary")`
			`utils.check_output(args=cmd)`

			`# use embeddings from max. iteration's dump`
			`max_iter_dump = iter - (iter % dump_period)`
			`os.rename('model_word_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.words'))`
			`os.rename('model_context_%d.txt' % max_iter_dump, os.path.join(model_dir, 'wordrank.contexts'))`
			`model = cls.load_wordrank_model(`
			`os.path.join(model_dir, 'wordrank.words'), vocab_file,`
			`os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab, ensemble`
			`)`

			`if cleanup_files:`
			`rmtree(model_dir)`
			`return model`

			`@classmethod`
			`def load_wordrank_model(cls, model_file, vocab_file=None, context_file=None, sorted_vocab=1, ensemble=1):`
			"""Load model from `model_file`.

			`Parameters`
			`----------`
			`model_file : str`
			`Path to model in GloVe format.`
			`vocab_file : str, optional`
			`Path to file with vocabulary.`
			`context_file : str, optional`
			`Path to file with context-embedding in word2vec_format.`
			`sorted_vocab : {0, 1}, optional`
			`If 1 - sort the vocabulary by descending frequency before assigning word indexes, otherwise - do nothing.`
			`ensemble : {0, 1}, optional`
			`If 1 - use ensemble of word and context vectors.`

			`"""`
			`glove2word2vec(model_file, model_file + '.w2vformat')`
			`model = cls.load_word2vec_format('%s.w2vformat' % model_file)`
			`if ensemble and context_file:`
			`model.ensemble_embedding(model_file, context_file)`
			`if sorted_vocab and vocab_file:`
			`model.sort_embeddings(vocab_file)`
			`return model`

			`def sort_embeddings(self, vocab_file):`
			`"""Sort embeddings according to word frequency.`

			`Parameters`
			`----------`
			`vocab_file : str`
			`Path to file with vocabulary.`

			`"""`
			`counts = {}`
			`vocab_size = len(self.vocab)`
			`prev_syn0 = copy.deepcopy(self.syn0)`
			`prev_vocab = copy.deepcopy(self.vocab)`
			`self.index2word = []`

			`# sort embeddings using frequency sorted vocab file in wordrank`
			`with utils.smart_open(vocab_file) as fin:`
			`for index, line in enumerate(fin):`
			`word, count = utils.to_unicode(line).strip(), vocab_size - index`
			`# store word with it's count in a dict`
			`counts[word] = int(count)`
			`# build new index2word with frequency sorted words`
			`self.index2word.append(word)`
			`assert len(self.index2word) == vocab_size, 'mismatch between vocab sizes'`

			`for word_id, word in enumerate(self.index2word):`
			`self.syn0[word_id] = prev_syn0[prev_vocab[word].index]`
			`self.vocab[word].index = word_id`
			`self.vocab[word].count = counts[word]`

			`def ensemble_embedding(self, word_embedding, context_embedding):`
			`"""Replace current syn0 with the sum of context and word embeddings.`

			`Parameters`
			`----------`
			`word_embedding : str`
			`Path to word embeddings in GloVe format.`
			`context_embedding : str`
			`Path to context embeddings in word2vec_format.`

			`Returns`
			`-------`
			`numpy.ndarray`
			`Matrix with new embeddings.`

			`"""`
			`glove2word2vec(context_embedding, context_embedding + '.w2vformat')`
			`w_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % word_embedding)`
			`c_emb = KeyedVectors.load_word2vec_format('%s.w2vformat' % context_embedding)`
			`# compare vocab words using keys of dict vocab`
			`assert set(w_emb.vocab) == set(c_emb.vocab), 'Vocabs are not same for both embeddings'`

			`# sort context embedding to have words in same order as word embedding`
			`prev_c_emb = copy.deepcopy(c_emb.syn0)`
			`for word_id, word in enumerate(w_emb.index2word):`
			`c_emb.syn0[word_id] = prev_c_emb[c_emb.vocab[word].index]`
			`# add vectors of the two embeddings`
			`new_emb = w_emb.syn0 + c_emb.syn0`
			`self.syn0 = new_emb`
			`return new_emb`