#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Authors: Chinmaya Pancholi, Shiva Manne
# Copyright (C) 2017 RaRe Technologies s.r.o.

"""
Warnings
--------
.. deprecated:: 3.3.0
   Use :mod:`gensim.models.fasttext` instead.

Learn word representations via fasttext's "skip-gram and CBOW models", using either
hierarchical softmax or negative sampling [1]_.

Notes
-----
There are more ways to get word vectors in Gensim than just FastText.
See wrappers for VarEmbed and WordRank, or Word2Vec.

This module allows training a word embedding from a training corpus with the additional ability
to obtain word vectors for out-of-vocabulary words.

For a tutorial on gensim's native fasttext, refer to the notebook -- [2]_

**Make sure you have a C compiler before installing gensim, to use optimized (compiled) fasttext training**

.. [1] P. Bojanowski, E. Grave, A. Joulin, T. Mikolov
       Enriching Word Vectors with Subword Information. In arXiv preprint arXiv:1607.04606.
       https://arxiv.org/abs/1607.04606

.. [2] https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb

"""

import logging

import numpy as np
from numpy import zeros, ones, vstack, sum as np_sum, empty, float32 as REAL

from gensim.models.deprecated.word2vec import Word2Vec, train_sg_pair, train_cbow_pair
from gensim.models.deprecated.fasttext_wrapper import FastTextKeyedVectors
from gensim.models.deprecated.fasttext_wrapper import FastText as Ft_Wrapper, compute_ngrams, ft_hash
from gensim.models.fasttext import FastText as NewFastText

logger = logging.getLogger(__name__)

FAST_VERSION = -1
MAX_WORDS_IN_BATCH = 10000


def load_old_fasttext(*args, **kwargs):
    """Load a saved instance of this deprecated `FastText` class and convert it to the current
    :class:`gensim.models.fasttext.FastText` class.
    """
    old_model = FastText.load(*args, **kwargs)
    params = {
        'size': old_model.vector_size,
        'alpha': old_model.alpha,
        'window': old_model.window,
        'min_count': old_model.min_count,
        'max_vocab_size': old_model.__dict__.get('max_vocab_size', None),
        'sample': old_model.sample,
        'seed': old_model.seed,
        'workers': old_model.workers,
        'min_alpha': old_model.min_alpha,
        'sg': old_model.sg,
        'hs': old_model.hs,
        'negative': old_model.negative,
        'cbow_mean': old_model.cbow_mean,
        'hashfxn': old_model.hashfxn,
        'iter': old_model.iter,
        'null_word': old_model.null_word,
        'sorted_vocab': old_model.sorted_vocab,
        'batch_words': old_model.batch_words,
        'min_n': old_model.min_n,
        'max_n': old_model.max_n,
        'word_ngrams': old_model.word_ngrams,
        'bucket': old_model.bucket
    }
    new_model = NewFastText(**params)
    # set trainables attributes
    new_model.wv.vectors = old_model.wv.syn0
    new_model.wv.vectors_vocab = old_model.wv.syn0_vocab
    new_model.wv.vectors_ngrams = old_model.wv.syn0_ngrams
    if hasattr(old_model.wv, 'syn0norm'):
        new_model.wv.vectors_norm = old_model.wv.syn0norm
    if hasattr(old_model, 'syn1'):
        new_model.trainables.syn1 = old_model.syn1
    if hasattr(old_model, 'syn1neg'):
        new_model.trainables.syn1neg = old_model.syn1neg
    if hasattr(old_model, 'syn0_lockf'):
        new_model.trainables.vectors_lockf = old_model.syn0_lockf
    if hasattr(old_model, 'syn0_vocab_lockf'):
        new_model.trainables.vectors_vocab_lockf = old_model.syn0_vocab_lockf
    if hasattr(old_model, 'syn0_ngrams_lockf'):
        new_model.trainables.vectors_ngrams_lockf = old_model.syn0_ngrams_lockf
    if hasattr(old_model.wv, 'syn0_vocab_norm'):
        new_model.trainables.vectors_vocab_norm = old_model.wv.syn0_vocab_norm
    if hasattr(old_model.wv, 'syn0_ngrams_norm'):
        new_model.trainables.vectors_ngrams_norm = old_model.wv.syn0_ngrams_norm

    # set vocabulary attributes
    new_model.wv.vocab = old_model.wv.vocab
    new_model.wv.index2word = old_model.wv.index2word
    new_model.vocabulary.cum_table = old_model.cum_table
    new_model.wv.hash2index = old_model.wv.hash2index

    new_model.train_count = old_model.train_count
    new_model.corpus_count = old_model.corpus_count
    new_model.running_training_loss = old_model.running_training_loss
    new_model.total_train_time = old_model.total_train_time
    new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
    new_model.model_trimmed_post_training = old_model.model_trimmed_post_training
    new_model.trainables.num_ngram_vectors = old_model.num_ngram_vectors

    return new_model
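
# Example (illustrative sketch): upgrading a model that was saved with this deprecated class to the
# current implementation. The path 'old_fasttext.model' is hypothetical -- substitute your own file
# saved via the deprecated `FastText.save()`.
#
# >>> from gensim.models.deprecated.fasttext import load_old_fasttext
# >>> new_model = load_old_fasttext('old_fasttext.model')  # returns a gensim.models.fasttext.FastText
# >>> vec = new_model.wv['cat']  # any word from the old model's vocabulary
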
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
    """Update CBOW model by training on a sequence of sentences.

    Each sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from fasttext_inner instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        `FastText` instance.
    sentences : iterable of iterables
        Iterable of the sentences directly from disk/network.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`
        Private working memory for each worker.
    neu1 : :class:`numpy.ndarray`
        Private working memory for each worker.

    Returns
    -------
    int
        Effective number of words trained.

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [
            model.wv.vocab[w] for w in sentence
            if w in model.wv.vocab and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32
        ]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]

            word2_subwords = []
            vocab_subwords_indices = []
            ngrams_subwords_indices = []

            for index in word2_indices:
                vocab_subwords_indices += [index]
                word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]

            for subword in word2_subwords:
                ngrams_subwords_indices.append(model.wv.ngrams[subword])

            l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0)  # 1 x vector_size
            l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0)  # 1 x vector_size

            l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
            subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
            if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
                l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))

            # train on the sliding window for target word
            train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)
        result += len(word_vocabs)
    return result
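
# Sketch of the subword-aware CBOW input assembled above, simplified to a single context word `w`
# (illustrative only; `model` stands for a trained instance of the deprecated FastText class below):
# the input projection `l1` is the whole-word row of `w` plus the rows of all of `w`'s char ngrams,
# averaged when `cbow_mean=1`.
#
# >>> w = model.wv.index2word[0]
# >>> vocab_rows = [model.wv.vocab[w].index]                                # row in syn0_vocab
# >>> ngram_rows = [model.wv.ngrams[ng] for ng in model.wv.ngrams_word[w]]  # rows in syn0_ngrams
# >>> l1 = model.wv.syn0_vocab[vocab_rows].sum(axis=0) + model.wv.syn0_ngrams[ngram_rows].sum(axis=0)
# >>> l1 /= (len(vocab_rows) + len(ngram_rows))  # mean of all contributing rows, as when cbow_mean=1
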
def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
    """Update skip-gram model by training on a sequence of sentences.

    Each sentence is a list of string tokens, which are looked up in the model's
    vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from fasttext_inner instead.

    Parameters
    ----------
    model : :class:`~gensim.models.fasttext.FastText`
        `FastText` instance.
    sentences : iterable of iterables
        Iterable of the sentences directly from disk/network.
    alpha : float
        Learning rate.
    work : :class:`numpy.ndarray`
        Private working memory for each worker.
    neu1 : :class:`numpy.ndarray`
        Private working memory for each worker.

    Returns
    -------
    int
        Effective number of words trained.

    """
    result = 0
    for sentence in sentences:
        word_vocabs = [
            model.wv.vocab[w] for w in sentence
            if w in model.wv.vocab and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32
        ]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            # now go over all words from the (reduced) window, predicting each one in turn
            start = max(0, pos - model.window + reduced_window)

            subwords_indices = [word.index]
            word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]

            for subword in word2_subwords:
                subwords_indices.append(model.wv.ngrams[subword])

            for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                if pos2 != pos:  # don't train on the `word` itself
                    train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)

        result += len(word_vocabs)
    return result
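
# Note on the `sample_int > model.random.rand() * 2**32` check used by both batch functions above:
# each vocabulary entry stores `sample_int`, its keep-probability scaled to the 32-bit integer range,
# so highly frequent words are randomly skipped during training. Illustrative sketch (assumes the
# word 'say' is in the vocabulary of a trained model `model`):
#
# >>> import numpy as np
# >>> keep_prob = model.wv.vocab['say'].sample_int / 2 ** 32  # may exceed 1.0 for words never downsampled
# >>> keep = model.wv.vocab['say'].sample_int > np.random.rand() * 2 ** 32  # True with probability keep_prob
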
class FastText(Word2Vec):
    """Class for training, using and evaluating word representations learned using the method
    described in [1]_, aka FastText.

    The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save()` and
    :meth:`~gensim.models.fasttext.FastText.load()` methods, or loaded in a format compatible with the original
    fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format()`.

    """
    def __init__(
            self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
            max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
            negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1,
            bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH):
        """Initialize the model from an iterable of `sentences`. Each sentence is a
        list of words (unicode strings) that will be used for training.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in the :mod:`~gensim.models.word2vec` module
            for such examples.
            If you don't supply `sentences`, the model is left uninitialized -- use this if you plan
            to initialize it in some other way.
        sg : int {1, 0}
            Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed.
        size : int
            Dimensionality of the feature vectors.
        window : int
            The maximum distance between the current and predicted word within a sentence.
        alpha : float
            The initial learning rate.
        min_alpha : float
            Learning rate will linearly drop to `min_alpha` as training progresses.
        seed : int
            Seed for the random number generator. Initial vectors for each word are seeded with a hash of
            the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
            you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
            from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
            use of the `PYTHONHASHSEED` environment variable to control hash randomization).
        min_count : int
            Ignores all words with total frequency lower than this.
        max_vocab_size : int
            Limits the RAM during vocabulary building; if there are more unique words than this, then
            prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
            Set to `None` for no limit.
        sample : float
            The threshold for configuring which higher-frequency words are randomly downsampled,
            useful range is (0, 1e-5).
        workers : int
            Use this many worker threads to train the model (=faster training with multicore machines).
        hs : int {1,0}
            If 1, hierarchical softmax will be used for model training.
            If set to 0, and `negative` is non-zero, negative sampling will be used.
        negative : int
            If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
            should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
        cbow_mean : int {1,0}
            If 0, use the sum of the context word vectors. If 1, use the mean; only applies when CBOW is used.
        hashfxn : function
            Hash function to use to randomly initialize weights, for increased training reproducibility.
        iter : int
            Number of iterations (epochs) over the corpus.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored
            as part of the model.
        sorted_vocab : int {1,0}
            If 1, sort the vocabulary by descending frequency before assigning word indexes.
        batch_words : int
            Target size (in words) for batches of examples passed to worker threads (and thus cython
            routines). (Larger batches will be passed if individual texts are longer than 10000 words,
            but the standard cython code truncates to that maximum.)
        min_n : int
            Min length of char ngrams to be used for training word representations.
        max_n : int
            Max length of char ngrams to be used for training word representations. Set `max_n` to be
            lesser than `min_n` to avoid char ngrams being used.
        word_ngrams : int {1,0}
            If 1, enriches word vectors with subword (ngram) information. If 0, this is equivalent to word2vec.
        bucket : int
            Character ngrams are hashed into a fixed number of buckets, in order to limit the
            memory usage of the model. This option specifies the number of buckets used by the model.

        Examples
        --------
        Initialize and train a `FastText` model

        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(sentences, min_count=1)
        >>> say_vector = model['say']  # get vector for word
        >>> of_vector = model['of']  # get vector for out-of-vocab word

        """
        # fastText specific params
        self.bucket = bucket
        self.word_ngrams = word_ngrams
        self.min_n = min_n
        self.max_n = max_n
        if self.word_ngrams <= 1 and self.max_n == 0:
            self.bucket = 0

        super(FastText, self).__init__(
            sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count,
            max_vocab_size=max_vocab_size, sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
            sg=sg, hs=hs, negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter, null_word=null_word,
            trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words)
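
    # Illustrative sketch of the subword-related parameters documented above (the toy corpus and
    # parameter values are assumptions, not taken from the original docs): per the docstring,
    # `word_ngrams=0` makes the model behave like plain word2vec, and `max_n < min_n` likewise
    # disables char ngrams.
    #
    # >>> from gensim.models import FastText
    # >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
    # >>> subword_model = FastText(sentences, min_count=1, min_n=2, max_n=5)  # 2- to 5-char ngrams
    # >>> plain_model = FastText(sentences, min_count=1, word_ngrams=0)       # word2vec-like behaviour
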
    def initialize_word_vectors(self):
        """Initialize a FastTextKeyedVectors instance to store all vocab/ngram vectors for the model."""
        self.wv = FastTextKeyedVectors()
        self.wv.min_n = self.min_n
        self.wv.max_n = self.max_n

    def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False):
        """Build vocabulary from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of unicode strings.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in the :mod:`~gensim.models.word2vec` module
            for such examples.
        keep_raw_vocab : bool
            If not true, delete the raw vocabulary after the scaling is done and free up RAM.
        trim_rule : function
            Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
            be trimmed away, or handled using the default (discard if word count < min_count).
            Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
            or a callable that accepts parameters (word, count, min_count) and returns either
            :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
            Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored
            as part of the model.
        progress_per : int
            Indicates how many words to process before showing/updating the progress.
        update : bool
            If true, the new words in `sentences` will be added to the model's vocab.

        Example
        -------
        Train a model and update vocab for online training

        >>> from gensim.models import FastText
        >>> sentences_1 = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>> sentences_2 = [["dude", "say", "wazzup!"]]
        >>>
        >>> model = FastText(min_count=1)
        >>> model.build_vocab(sentences_1)
        >>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.iter)
        >>> model.build_vocab(sentences_2, update=True)
        >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.iter)

        """
        if update:
            if not len(self.wv.vocab):
                raise RuntimeError(
                    "You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
                    "First build the vocabulary of your model with a corpus "
                    "before doing an online update.")
            self.old_vocab_len = len(self.wv.vocab)
            self.old_hash2index_len = len(self.wv.hash2index)

        super(FastText, self).build_vocab(
            sentences, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, progress_per=progress_per, update=update)
        self.init_ngrams(update=update)
    def init_ngrams(self, update=False):
        """Compute ngrams of all words present in vocabulary and store vectors for only those ngrams.
        Vectors for other ngrams are initialized with a random uniform distribution in FastText.

        Parameters
        ----------
        update : bool
            If True, the new vocab words and their new ngrams word vectors are initialized
            with random uniform distribution and updated/added to the existing vocab word and ngram vectors.

        """
        if not update:
            self.wv.ngrams = {}
            self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL)
            self.syn0_vocab_lockf = ones((len(self.wv.vocab), self.vector_size), dtype=REAL)

            self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL)
            self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL)

            all_ngrams = []
            for w, v in self.wv.vocab.items():
                self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
                all_ngrams += self.wv.ngrams_word[w]

            all_ngrams = list(set(all_ngrams))
            self.num_ngram_vectors = len(all_ngrams)
            logger.info("Total number of ngrams is %d", len(all_ngrams))

            self.wv.hash2index = {}
            ngram_indices = []
            new_hash_count = 0
            for i, ngram in enumerate(all_ngrams):
                ngram_hash = ft_hash(ngram) % self.bucket
                if ngram_hash in self.wv.hash2index:
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                else:
                    ngram_indices.append(ngram_hash % self.bucket)
                    self.wv.hash2index[ngram_hash] = new_hash_count
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                    new_hash_count = new_hash_count + 1

            self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)
            self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0)
            self.reset_ngram_weights()
        else:
            new_ngrams = []
            for w, v in self.wv.vocab.items():
                self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
                new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams]

            new_ngrams = list(set(new_ngrams))
            logger.info("Number of new ngrams is %d", len(new_ngrams))
            new_hash_count = 0
            for i, ngram in enumerate(new_ngrams):
                ngram_hash = ft_hash(ngram) % self.bucket
                if ngram_hash not in self.wv.hash2index:
                    self.wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
                    new_hash_count = new_hash_count + 1
                else:
                    self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]

            rand_obj = np.random
            rand_obj.seed(self.seed)
            new_vocab_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (len(self.wv.vocab) - self.old_vocab_len, self.vector_size)
            ).astype(REAL)
            new_vocab_lockf_rows = ones((len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL)
            new_ngram_rows = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size,
                (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size)
            ).astype(REAL)
            new_ngram_lockf_rows = ones(
                (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size), dtype=REAL)

            self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
            self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows])
            self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows])
            self.syn0_ngrams_lockf = vstack([self.syn0_ngrams_lockf, new_ngram_lockf_rows])
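
    # Sketch of the ngram bucketing done in `init_ngrams` above (illustrative only): every char ngram
    # of every vocabulary word is hashed with `ft_hash` and reduced modulo `bucket`; `hash2index` then
    # remaps the buckets that actually occur onto a compact range of row indices in `syn0_ngrams`.
    #
    # >>> from gensim.models.deprecated.fasttext_wrapper import compute_ngrams, ft_hash
    # >>> ngrams = compute_ngrams('night', 3, 6)             # char ngrams of '<night>', e.g. '<ni', 'nig', ...
    # >>> buckets = [ft_hash(ng) % 2000000 for ng in ngrams]  # raw bucket ids (2000000 = default `bucket`)
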
    def reset_ngram_weights(self):
        """Reset all projection weights to an initial (untrained) state,
        but keep the existing vocabulary and their ngrams.

        """
        rand_obj = np.random
        rand_obj.seed(self.seed)
        for index in range(len(self.wv.vocab)):
            self.wv.syn0_vocab[index] = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size
            ).astype(REAL)
        for index in range(len(self.wv.hash2index)):
            self.wv.syn0_ngrams[index] = rand_obj.uniform(
                -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size
            ).astype(REAL)

    def _do_train_job(self, sentences, alpha, inits):
        """Train a single batch of sentences. Return 2-tuple `(effective word count after
        ignoring unknown words and sentence length trimming, total word count)`.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in the :mod:`~gensim.models.word2vec` module
            for such examples.
        alpha : float
            The current learning rate.
        inits : (:class:`numpy.ndarray`, :class:`numpy.ndarray`)
            Each worker's private work memory.

        Returns
        -------
        (int, int)
            Tuple of (effective word count after ignoring unknown words and sentence length trimming,
            total word count).

        """
        work, neu1 = inits
        tally = 0
        if self.sg:
            tally += train_batch_sg(self, sentences, alpha, work, neu1)
        else:
            tally += train_batch_cbow(self, sentences, alpha, work, neu1)

        return tally, self._raw_word_count(sentences)

    def train(self, sentences, total_examples=None, total_words=None,
              epochs=None, start_alpha=None, end_alpha=None,
              word_count=0, queue_factor=2, report_delay=1.0):
        """Update the model's neural weights from a sequence of sentences (can be a once-only
        generator stream). For FastText, each sentence must be a list of unicode strings.
        (Subclasses may accept other examples.)

        To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate
        progress-percentage logging, either total_examples (count of sentences) or total_words
        (count of raw words in sentences) **MUST** be provided (if the corpus is the same as was
        provided to :meth:`~gensim.models.fasttext.FastText.build_vocab()`, the count of examples
        in that corpus will be available in the model's :attr:`corpus_count` property).

        To avoid common mistakes around the model's ability to do multiple training passes itself,
        an explicit `epochs` argument **MUST** be provided. In the common and recommended case,
        where :meth:`~gensim.models.fasttext.FastText.train()` is only called once, the model's
        cached `iter` value should be supplied as the `epochs` value.

        Parameters
        ----------
        sentences : iterable of iterables
            The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in the :mod:`~gensim.models.word2vec` module
            for such examples.
        total_examples : int
            Count of sentences.
        total_words : int
            Count of raw words in sentences.
        epochs : int
            Number of iterations (epochs) over the corpus.
        start_alpha : float
            Initial learning rate.
        end_alpha : float
            Final learning rate. Drops linearly from `start_alpha`.
        word_count : int
            Count of words already trained. Set this to 0 for the usual
            case of training on all words in sentences.
        queue_factor : int
            Multiplier for size of queue (number of workers * queue_factor).
        report_delay : float
            Seconds to wait before reporting progress.

        Examples
        --------
        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(min_count=1)
        >>> model.build_vocab(sentences)
        >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

        """
        self.neg_labels = []
        if self.negative > 0:
            # precompute negative labels optimization for pure-python training
            self.neg_labels = zeros(self.negative + 1)
            self.neg_labels[0] = 1.

        # delegate to Word2Vec.train, using the model's cached corpus_count / iter / alpha settings
        Word2Vec.train(
            self, sentences, total_examples=self.corpus_count, epochs=self.iter,
            start_alpha=self.alpha, end_alpha=self.min_alpha)
        self.get_vocab_word_vecs()

    def __getitem__(self, word):
        """Get `word`'s representation in vector space, as a 1D numpy array.

        Parameters
        ----------
        word : str
            A single word whose vector needs to be returned.

        Returns
        -------
        :class:`numpy.ndarray`
            The word's representation in vector space, as a 1D numpy array.

        Raises
        ------
        KeyError
            For words with all ngrams absent, a KeyError is raised.

        Example
        -------
        >>> from gensim.models import FastText
        >>> from gensim.test.utils import datapath
        >>>
        >>> trained_model = FastText.load_fasttext_format(datapath('lee_fasttext'))
        >>> hello_vector = trained_model['hello']  # get vector for word

        """
        return self.word_vec(word)

    def get_vocab_word_vecs(self):
        """Calculate vectors for words in vocabulary and store them in `wv.syn0`."""
        for w, v in self.wv.vocab.items():
            word_vec = np.copy(self.wv.syn0_vocab[v.index])
            ngrams = self.wv.ngrams_word[w]
            ngram_weights = self.wv.syn0_ngrams
            for ngram in ngrams:
                word_vec += ngram_weights[self.wv.ngrams[ngram]]
            word_vec /= (len(ngrams) + 1)
            self.wv.syn0[v.index] = word_vec

    def word_vec(self, word, use_norm=False):
        """Get the word's representation in vector space, as a 1D numpy array.

        Parameters
        ----------
        word : str
            A single word whose vector needs to be returned.
        use_norm : bool
            If True, returns the normalized word vector.

        Returns
        -------
        :class:`numpy.ndarray`
            The word's representation in vector space, as a 1D numpy array.

        Raises
        ------
        KeyError
            For words with all ngrams absent, a KeyError is raised.

        Example
        -------
        >>> from gensim.models import FastText
        >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
        >>>
        >>> model = FastText(sentences, min_count=1)
        >>> meow_vector = model.word_vec('meow')  # get vector for word

        """
        return FastTextKeyedVectors.word_vec(self.wv, word, use_norm=use_norm)

    @classmethod
    def load_fasttext_format(cls, *args, **kwargs):
        """Load a :class:`~gensim.models.fasttext.FastText` model from a format compatible
        with the original fasttext implementation.

        Parameters
        ----------
        fname : str
            Path to the file.

        """
        return Ft_Wrapper.load_fasttext_format(*args, **kwargs)

    def save(self, *args, **kwargs):
        """Save the model. This saved model can be loaded again using
        :func:`~gensim.models.fasttext.FastText.load`, which supports online training and getting
        vectors for out-of-vocabulary words.

        Parameters
        ----------
        fname : str
            Path to the file.

        """
        kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm'])
        super(FastText, self).save(*args, **kwargs)
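
# Sketch of how this module composes a word's final vector (see `get_vocab_word_vecs` above and
# `FastTextKeyedVectors.word_vec` in the wrapper): an in-vocabulary word's vector is the average of
# its whole-word row and all of its char-ngram rows, while an out-of-vocabulary word falls back to
# its char ngrams alone -- which is why `model['of']` works even if 'of' was never seen in training.
# Illustrative only; assumes `model` is a trained instance with 'say' in its vocabulary.
#
# >>> import numpy as np
# >>> w = 'say'
# >>> ngram_rows = [model.wv.ngrams[ng] for ng in model.wv.ngrams_word[w]]
# >>> composed = model.wv.syn0_vocab[model.wv.vocab[w].index] + model.wv.syn0_ngrams[ngram_rows].sum(axis=0)
# >>> composed /= (len(ngram_rows) + 1)   # same averaging as get_vocab_word_vecs
# >>> np.allclose(composed, model.wv.syn0[model.wv.vocab[w].index])
# True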