laywerrobot/lib/python3.6/site-packages/gensim/models/deprecated/fasttext.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Authors: Chinmaya Pancholi <chinmayapancholi13@gmail.com>, Shiva Manne <s.manne@rare-technologies.com>
# Copyright (C) 2017 RaRe Technologies s.r.o.
"""
Warnings
--------
.. deprecated:: 3.3.0
Use :mod:`gensim.models.fasttext` instead.
Learn word representations via fasttext's "skip-gram and CBOW models", using either
hierarchical softmax or negative sampling [1]_.
Notes
-----
There are more ways to get word vectors in Gensim than just FastText.
See wrappers for VarEmbed and WordRank, or Word2Vec.
This module allows training a word embedding from a training corpus with the additional ability
to obtain word vectors for out-of-vocabulary words.
For a tutorial on gensim's native fasttext, refer to the notebook -- [2]_
**Make sure you have a C compiler before installing gensim, to use optimized (compiled) fasttext training**
.. [1] P. Bojanowski, E. Grave, A. Joulin, T. Mikolov
Enriching Word Vectors with Subword Information. In arXiv preprint arXiv:1607.04606.
https://arxiv.org/abs/1607.04606
.. [2] https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb
"""
import logging
import numpy as np
from numpy import zeros, ones, vstack, sum as np_sum, empty, float32 as REAL
from gensim.models.deprecated.word2vec import Word2Vec, train_sg_pair, train_cbow_pair
from gensim.models.deprecated.fasttext_wrapper import FastTextKeyedVectors
from gensim.models.deprecated.fasttext_wrapper import FastText as Ft_Wrapper, compute_ngrams, ft_hash
from gensim.models.fasttext import FastText as NewFastText
logger = logging.getLogger(__name__)
FAST_VERSION = -1
MAX_WORDS_IN_BATCH = 10000
def load_old_fasttext(*args, **kwargs):
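    """Load a model saved with this deprecated `FastText` implementation and convert it to a
    :class:`gensim.models.fasttext.FastText` instance with the current attribute layout.
    All arguments are forwarded to :meth:`FastText.load`.
    """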
old_model = FastText.load(*args, **kwargs)
params = {
'size': old_model.vector_size,
'alpha': old_model.alpha,
'window': old_model.window,
'min_count': old_model.min_count,
'max_vocab_size': old_model.__dict__.get('max_vocab_size', None),
'sample': old_model.sample,
'seed': old_model.seed,
'workers': old_model.workers,
'min_alpha': old_model.min_alpha,
'sg': old_model.sg,
'hs': old_model.hs,
'negative': old_model.negative,
'cbow_mean': old_model.cbow_mean,
'hashfxn': old_model.hashfxn,
'iter': old_model.iter,
'null_word': old_model.null_word,
'sorted_vocab': old_model.sorted_vocab,
'batch_words': old_model.batch_words,
'min_n': old_model.min_n,
'max_n': old_model.max_n,
'word_ngrams': old_model.word_ngrams,
'bucket': old_model.bucket
}
new_model = NewFastText(**params)
# set trainables attributes
new_model.wv.vectors = old_model.wv.syn0
new_model.wv.vectors_vocab = old_model.wv.syn0_vocab
new_model.wv.vectors_ngrams = old_model.wv.syn0_ngrams
if hasattr(old_model.wv, 'syn0norm'):
new_model.wv.vectors_norm = old_model.wv.syn0norm
if hasattr(old_model, 'syn1'):
new_model.trainables.syn1 = old_model.syn1
if hasattr(old_model, 'syn1neg'):
new_model.trainables.syn1neg = old_model.syn1neg
if hasattr(old_model, 'syn0_lockf'):
new_model.trainables.vectors_lockf = old_model.syn0_lockf
if hasattr(old_model, 'syn0_vocab_lockf'):
new_model.trainables.vectors_vocab_lockf = old_model.syn0_vocab_lockf
if hasattr(old_model, 'syn0_ngrams_lockf'):
new_model.trainables.vectors_ngrams_lockf = old_model.syn0_ngrams_lockf
if hasattr(old_model.wv, 'syn0_vocab_norm'):
new_model.trainables.vectors_vocab_norm = old_model.wv.syn0_vocab_norm
if hasattr(old_model.wv, 'syn0_ngrams_norm'):
new_model.trainables.vectors_ngrams_norm = old_model.wv.syn0_ngrams_norm
# set vocabulary attributes
new_model.wv.vocab = old_model.wv.vocab
new_model.wv.index2word = old_model.wv.index2word
new_model.vocabulary.cum_table = old_model.cum_table
new_model.wv.hash2index = old_model.wv.hash2index
new_model.train_count = old_model.train_count
new_model.corpus_count = old_model.corpus_count
new_model.running_training_loss = old_model.running_training_loss
new_model.total_train_time = old_model.total_train_time
new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached
new_model.model_trimmed_post_training = old_model.model_trimmed_post_training
new_model.trainables.num_ngram_vectors = old_model.num_ngram_vectors
return new_model
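# Illustrative sketch (the path 'old_fasttext.model' is hypothetical): converting a model that was
# saved with this deprecated class carries its vectors, vocabulary and training state over to the
# current API.
#
#     >>> converted = load_old_fasttext('old_fasttext.model')  # doctest: +SKIP
#     >>> converted.wv.vectors.shape  # same data as the old model's wv.syn0  # doctest: +SKIP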
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
"""Update CBOW model by training on a sequence of sentences.
Each sentence is a list of string tokens, which are looked up in the model's
vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.
This is the non-optimized, Python version. If you have cython installed, gensim
will use the optimized version from fasttext_inner instead.
Parameters
----------
model : :class:`~gensim.models.fasttext.FastText`
`FastText` instance.
sentences : iterable of iterables
Iterable of the sentences directly from disk/network.
alpha : float
Learning rate.
work : :class:`numpy.ndarray`
Private working memory for each worker.
neu1 : :class:`numpy.ndarray`
Private working memory for each worker.
Returns
-------
int
Effective number of words trained.
"""
result = 0
for sentence in sentences:
word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window)
start = max(0, pos - model.window + reduced_window)
window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
word2_subwords = []
vocab_subwords_indices = []
ngrams_subwords_indices = []
for index in word2_indices:
vocab_subwords_indices += [index]
word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]]
for subword in word2_subwords:
ngrams_subwords_indices.append(model.wv.ngrams[subword])
l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0) # 1 x vector_size
l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0) # 1 x vector_size
l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))
# train on the sliding window for target word
train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)
result += len(word_vocabs)
return result
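# Illustrative sketch (toy arrays, not the module's data structures): when `cbow_mean=1`, the CBOW
# input `l1` is the mean over the context words' vocab rows and all of their ngram rows, mirroring
# the sums computed above.
#
#     >>> import numpy as np
#     >>> vocab_rows = np.array([[1., 1.], [3., 3.]], dtype=np.float32)  # 2 context words
#     >>> ngram_rows = np.array([[2., 2.]], dtype=np.float32)            # 1 shared ngram
#     >>> l1 = (vocab_rows.sum(axis=0) + ngram_rows.sum(axis=0)) / (len(vocab_rows) + len(ngram_rows))
#     >>> l1
#     array([2., 2.], dtype=float32)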
def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
"""Update skip-gram model by training on a sequence of sentences.
Each sentence is a list of string tokens, which are looked up in the model's
vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`.
This is the non-optimized, Python version. If you have cython installed, gensim
will use the optimized version from fasttext_inner instead.
Parameters
----------
model : :class:`~gensim.models.fasttext.FastText`
`FastText` instance.
sentences : iterable of iterables
Iterable of the sentences directly from disk/network.
alpha : float
Learning rate.
work : :class:`numpy.ndarray`
Private working memory for each worker.
neu1 : :class:`numpy.ndarray`
Private working memory for each worker.
Returns
-------
int
Effective number of words trained.
"""
result = 0
for sentence in sentences:
word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original word2vec code
# now go over all words from the (reduced) window, predicting each one in turn
start = max(0, pos - model.window + reduced_window)
subwords_indices = [word.index]
word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]]
for subword in word2_subwords:
subwords_indices.append(model.wv.ngrams[subword])
for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
if pos2 != pos: # don't train on the `word` itself
train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)
result += len(word_vocabs)
return result
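# Illustrative sketch (toy settings `min_n=3, max_n=6, bucket=2000000`): a target word's subword
# index list is its own vocab index plus the bucketed hashes of its character ngrams, which is
# what `subwords_indices` above collects before each `train_sg_pair` call.
#
#     >>> from gensim.models.deprecated.fasttext_wrapper import compute_ngrams, ft_hash
#     >>> ngrams = compute_ngrams('night', 3, 6)          # e.g. '<ni', 'nig', 'igh', ...
#     >>> buckets = [ft_hash(ng) % 2000000 for ng in ngrams]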
class FastText(Word2Vec):
"""Class for training, using and evaluating word representations learned using method
described in [1]_ aka Fasttext.
The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save()` and
:meth:`~gensim.models.fasttext.FastText.load()` methods, or loaded in a format compatible with the original
fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format()`.
"""
def __init__(
self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1,
bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH):
"""Initialize the model from an iterable of `sentences`. Each sentence is a
list of words (unicode strings) that will be used for training.
Parameters
----------
sentences : iterable of iterables
The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it
in some other way.
sg : int {1, 0}
Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed.
size : int
Dimensionality of the feature vectors.
window : int
The maximum distance between the current and predicted word within a sentence.
alpha : float
The initial learning rate.
min_alpha : float
Learning rate will linearly drop to `min_alpha` as training progresses.
seed : int
Seed for the random number generator. Initial vectors for each word are seeded with a hash of
the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
use of the `PYTHONHASHSEED` environment variable to control hash randomization).
min_count : int
Ignores all words with total frequency lower than this.
max_vocab_size : int
Limits the RAM during vocabulary building; if there are more unique
words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
Set to `None` for no limit.
sample : float
The threshold for configuring which higher-frequency words are randomly downsampled,
useful range is (0, 1e-5).
workers : int
Use this many worker threads to train the model (=faster training with multicore machines).
hs : int {1,0}
If 1, hierarchical softmax will be used for model training.
If set to 0, and `negative` is non-zero, negative sampling will be used.
negative : int
If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
should be drawn (usually between 5-20).
If set to 0, no negative sampling is used.
cbow_mean : int {1,0}
If 0, use the sum of the context word vectors. If 1, use the mean. Only applies when CBOW is used.
hashfxn : function
Hash function to use to randomly initialize weights, for increased training reproducibility.
iter : int
Number of iterations (epochs) over the corpus.
trim_rule : function
Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
be trimmed away, or handled using the default (discard if word count < min_count).
Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
or a callable that accepts parameters (word, count, min_count) and returns either
:attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
of the model.
sorted_vocab : int {1,0}
If 1, sort the vocabulary by descending frequency before assigning word indexes.
batch_words : int
Target size (in words) for batches of examples passed to worker threads (and
thus cython routines). (Larger batches will be passed if individual
texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
min_n : int
Min length of char ngrams to be used for training word representations.
max_n : int
Max length of char ngrams to be used for training word representations. Set `max_n` to be
less than `min_n` to avoid char ngrams being used.
word_ngrams : int {1,0}
If 1, enrich word vectors with subword (ngram) information.
If 0, this is equivalent to word2vec.
bucket : int
Character ngrams are hashed into a fixed number of buckets, in order to limit the
memory usage of the model. This option specifies the number of buckets used by the model.
Examples
--------
Initialize and train a `FastText` model
>>> from gensim.models import FastText
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>>
>>> model = FastText(sentences, min_count=1)
>>> say_vector = model['say'] # get vector for word
>>> of_vector = model['of'] # get vector for out-of-vocab word
"""
# fastText specific params
self.bucket = bucket
self.word_ngrams = word_ngrams
self.min_n = min_n
self.max_n = max_n
if self.word_ngrams <= 1 and self.max_n == 0:
self.bucket = 0
super(FastText, self).__init__(
sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count,
max_vocab_size=max_vocab_size, sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
sg=sg, hs=hs, negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter, null_word=null_word,
trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words)
def initialize_word_vectors(self):
"""Initializes FastTextKeyedVectors instance to store all vocab/ngram vectors for the model."""
self.wv = FastTextKeyedVectors()
self.wv.min_n = self.min_n
self.wv.max_n = self.max_n
def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False):
"""Build vocabulary from a sequence of sentences (can be a once-only generator stream).
Each sentence must be a list of unicode strings.
Parameters
----------
sentences : iterable of iterables
The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
keep_raw_vocab : bool
If not true, delete the raw vocabulary after the scaling is done and free up RAM.
trim_rule : function
Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
be trimmed away, or handled using the default (discard if word count < min_count).
Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
or a callable that accepts parameters (word, count, min_count) and returns either
:attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part
of the model.
progress_per : int
Indicates how many words to process before showing/updating the progress.
update : bool
If true, the new words in `sentences` will be added to model's vocab.
Example
-------
Train a model and update vocab for online training
>>> from gensim.models import FastText
>>> sentences_1 = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>> sentences_2 = [["dude", "say", "wazzup!"]]
>>>
>>> model = FastText(min_count=1)
>>> model.build_vocab(sentences_1)
>>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.iter)
>>> model.build_vocab(sentences_2, update=True)
>>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.iter)
"""
if update:
if not len(self.wv.vocab):
raise RuntimeError(
"You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
"First build the vocabulary of your model with a corpus "
"before doing an online update.")
self.old_vocab_len = len(self.wv.vocab)
self.old_hash2index_len = len(self.wv.hash2index)
super(FastText, self).build_vocab(
sentences, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, progress_per=progress_per, update=update)
self.init_ngrams(update=update)
def init_ngrams(self, update=False):
"""Compute ngrams of all words present in vocabulary and stores vectors for only those ngrams.
Vectors for other ngrams are initialized with a random uniform distribution in FastText.
Parameters
----------
update : bool
If True, the new vocabulary words and their ngram vectors are initialized
with a random uniform distribution and added to the existing vocabulary and ngram vectors.
"""
if not update:
self.wv.ngrams = {}
self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL)
self.syn0_vocab_lockf = ones((len(self.wv.vocab), self.vector_size), dtype=REAL)
self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL)
self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL)
all_ngrams = []
for w, v in self.wv.vocab.items():
self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
all_ngrams += self.wv.ngrams_word[w]
all_ngrams = list(set(all_ngrams))
self.num_ngram_vectors = len(all_ngrams)
logger.info("Total number of ngrams is %d", len(all_ngrams))
self.wv.hash2index = {}
ngram_indices = []
new_hash_count = 0
for i, ngram in enumerate(all_ngrams):
ngram_hash = ft_hash(ngram) % self.bucket
if ngram_hash in self.wv.hash2index:
self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
else:
ngram_indices.append(ngram_hash % self.bucket)
self.wv.hash2index[ngram_hash] = new_hash_count
self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
new_hash_count = new_hash_count + 1
self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0)
self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0)
self.reset_ngram_weights()
else:
new_ngrams = []
for w, v in self.wv.vocab.items():
self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n)
new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams]
new_ngrams = list(set(new_ngrams))
logger.info("Number of new ngrams is %d", len(new_ngrams))
new_hash_count = 0
for i, ngram in enumerate(new_ngrams):
ngram_hash = ft_hash(ngram) % self.bucket
if ngram_hash not in self.wv.hash2index:
self.wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len
self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
new_hash_count = new_hash_count + 1
else:
self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash]
rand_obj = np.random
rand_obj.seed(self.seed)
new_vocab_rows = rand_obj.uniform(
-1.0 / self.vector_size, 1.0 / self.vector_size,
(len(self.wv.vocab) - self.old_vocab_len, self.vector_size)
).astype(REAL)
new_vocab_lockf_rows = ones((len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL)
new_ngram_rows = rand_obj.uniform(
-1.0 / self.vector_size, 1.0 / self.vector_size,
(len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size)
).astype(REAL)
new_ngram_lockf_rows = ones(
(len(self.wv.hash2index) - self.old_hash2index_len,
self.vector_size),
dtype=REAL)
self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows])
self.syn0_vocab_lockf = vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows])
self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows])
self.syn0_ngrams_lockf = vstack([self.syn0_ngrams_lockf, new_ngram_lockf_rows])
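    # Illustrative note (toy ngrams 'abc' and 'xyz', bucket=2000000): `hash2index` maps a bucketed
    # ngram hash to a compact row index in `syn0_ngrams`, so distinct ngrams that collide in the
    # same bucket share a single vector row, exactly as in the loops above.
    #
    #     >>> from gensim.models.deprecated.fasttext_wrapper import ft_hash
    #     >>> hash2index = {}
    #     >>> for ng in ('abc', 'xyz'):
    #     ...     h = ft_hash(ng) % 2000000
    #     ...     if h not in hash2index:
    #     ...         hash2index[h] = len(hash2index)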
def reset_ngram_weights(self):
"""Reset all projection weights to an initial (untrained) state,
but keep the existing vocabulary and their ngrams.
"""
rand_obj = np.random
rand_obj.seed(self.seed)
for index in range(len(self.wv.vocab)):
self.wv.syn0_vocab[index] = rand_obj.uniform(
-1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size
).astype(REAL)
for index in range(len(self.wv.hash2index)):
self.wv.syn0_ngrams[index] = rand_obj.uniform(
-1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size
).astype(REAL)
def _do_train_job(self, sentences, alpha, inits):
"""Train a single batch of sentences. Return 2-tuple `(effective word count after
ignoring unknown words and sentence length trimming, total word count)`.
Parameters
----------
sentences : iterable of iterables
The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
alpha : float
The current learning rate.
inits : (:class:`numpy.ndarray`, :class:`numpy.ndarray`)
Each worker's private work memory.
Returns
-------
(int, int)
Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count)
"""
work, neu1 = inits
tally = 0
if self.sg:
tally += train_batch_sg(self, sentences, alpha, work, neu1)
else:
tally += train_batch_cbow(self, sentences, alpha, work, neu1)
return tally, self._raw_word_count(sentences)
def train(self, sentences, total_examples=None, total_words=None,
epochs=None, start_alpha=None, end_alpha=None,
word_count=0, queue_factor=2, report_delay=1.0):
"""Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
For FastText, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)
To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate
progress-percentage logging, either total_examples (count of sentences) or total_words (count of
raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to
:meth:`~gensim.models.fasttext.FastText.build_vocab()`, the count of examples in that corpus
will be available in the model's :attr:`corpus_count` property).
To avoid common mistakes around the model's ability to do multiple training passes itself, an
explicit `epochs` argument **MUST** be provided. In the common and recommended case,
where :meth:`~gensim.models.fasttext.FastText.train()` is only called once,
the model's cached `iter` value should be supplied as `epochs` value.
Parameters
----------
sentences : iterable of iterables
The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
total_examples : int
Count of sentences.
total_words : int
Count of raw words in sentences.
epochs : int
Number of iterations (epochs) over the corpus.
start_alpha : float
Initial learning rate.
end_alpha : float
Final learning rate. Drops linearly from `start_alpha`.
word_count : int
Count of words already trained. Set this to 0 for the usual
case of training on all words in sentences.
queue_factor : int
Multiplier for size of queue (number of workers * queue_factor).
report_delay : float
Seconds to wait before reporting progress.
Examples
--------
>>> from gensim.models import FastText
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>>
>>> model = FastText(min_count=1)
>>> model.build_vocab(sentences)
>>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
"""
self.neg_labels = []
if self.negative > 0:
# precompute negative labels optimization for pure-python training
self.neg_labels = zeros(self.negative + 1)
self.neg_labels[0] = 1.
Word2Vec.train(
    self, sentences, total_examples=total_examples, total_words=total_words, epochs=epochs,
    start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
    queue_factor=queue_factor, report_delay=report_delay)
self.get_vocab_word_vecs()
def __getitem__(self, word):
"""Get `word` representations in vector space, as a 1D numpy array.
Parameters
----------
word : str
A single word whose vector needs to be returned.
Returns
-------
:class:`numpy.ndarray`
The word's representations in vector space, as a 1D numpy array.
Raises
------
KeyError
For words with all ngrams absent, a KeyError is raised.
Example
-------
>>> from gensim.models import FastText
>>> from gensim.test.utils import datapath
>>>
>>> trained_model = FastText.load_fasttext_format(datapath('lee_fasttext'))
>>> hello_vector = trained_model['hello']  # get vector for word
"""
return self.word_vec(word)
def get_vocab_word_vecs(self):
"""Calculate vectors for words in vocabulary and stores them in `wv.syn0`."""
for w, v in self.wv.vocab.items():
word_vec = np.copy(self.wv.syn0_vocab[v.index])
ngrams = self.wv.ngrams_word[w]
ngram_weights = self.wv.syn0_ngrams
for ngram in ngrams:
word_vec += ngram_weights[self.wv.ngrams[ngram]]
word_vec /= (len(ngrams) + 1)
self.wv.syn0[v.index] = word_vec
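    # Illustrative sketch (toy numbers): the stored word vector is the mean of the word's own
    # vocab vector and all of its ngram vectors, i.e. (vocab_vec + sum(ngram_vecs)) / (n_ngrams + 1).
    #
    #     >>> import numpy as np
    #     >>> vocab_vec = np.array([1., 1.], dtype=np.float32)
    #     >>> ngram_vecs = [np.array([2., 2.], dtype=np.float32), np.array([6., 6.], dtype=np.float32)]
    #     >>> (vocab_vec + sum(ngram_vecs)) / (len(ngram_vecs) + 1)
    #     array([3., 3.], dtype=float32)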
def word_vec(self, word, use_norm=False):
"""Get the word's representations in vector space, as a 1D numpy array.
Parameters
----------
word : str
A single word whose vector needs to be returned.
use_norm : bool
If True, returns normalized vector.
Returns
-------
:class:`numpy.ndarray`
The word's representations in vector space, as a 1D numpy array.
Raises
------
KeyError
For words with all ngrams absent, a KeyError is raised.
Example
-------
>>> from gensim.models import FastText
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>>
>>> model = FastText(sentences, min_count=1)
>>> meow_vector = model.word_vec('meow') # get vector for word
"""
return FastTextKeyedVectors.word_vec(self.wv, word, use_norm=use_norm)
@classmethod
def load_fasttext_format(cls, *args, **kwargs):
"""Load a :class:`~gensim.models.fasttext.FastText` model from a format compatible with
the original fasttext implementation.
Parameters
----------
fname : str
Path to the file.
"""
return Ft_Wrapper.load_fasttext_format(*args, **kwargs)
def save(self, *args, **kwargs):
"""Save the model. This saved model can be loaded again using :func:`~gensim.models.fasttext.FastText.load`,
which supports online training and getting vectors for out-of-vocabulary words.
Parameters
----------
fname : str
Path to the file.
"""
kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm'])
super(FastText, self).save(*args, **kwargs)
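    # Illustrative sketch (the path 'deprecated_ft.model' and the `model` instance are hypothetical):
    # models saved this way are reloaded with the matching `load`, and can then be converted to the
    # current API via `load_old_fasttext`.
    #
    #     >>> model.save('deprecated_ft.model')                 # doctest: +SKIP
    #     >>> reloaded = FastText.load('deprecated_ft.model')   # doctest: +SKIP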