#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Authors: Shiva Manne <manneshiva@gmail.com>, Chinmaya Pancholi <chinmayapancholi13@gmail.com>
# Copyright (C) 2018 RaRe Technologies s.r.o.
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""Learn word representations via Fasttext: `Enriching Word Vectors with Subword Information
<https://arxiv.org/abs/1607.04606>`_.
This module allows training word embeddings from a training corpus with the additional ability to obtain word vectors
for out-of-vocabulary words.
This module contains a fast native C implementation of Fasttext with Python interfaces. It is **not** only a wrapper
around Facebook's implementation.
For a tutorial see `this notebook
<https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb>`_.
**Make sure you have a C compiler before installing Gensim, to use the optimized (compiled) Fasttext
training routines.**
Usage examples
--------------
Initialize and train a model
>>> from gensim.test.utils import common_texts
>>> from gensim.models import FastText
>>>
>>> model = FastText(common_texts, size=4, window=3, min_count=1, iter=10)
Persist a model to disk with
>>> from gensim.test.utils import get_tmpfile
>>>
>>> fname = get_tmpfile("fasttext.model")
>>>
>>> model.save(fname)
>>> model = FastText.load(fname) # you can continue training with the loaded model!
Retrieve word vectors for in-vocabulary and out-of-vocabulary words
>>> existent_word = "computer"
>>> existent_word in model.wv.vocab
True
>>> computer_vec = model.wv[existent_word] # numpy vector of a word
>>>
>>> oov_word = "graph-out-of-vocab"
>>> oov_word in model.wv.vocab
False
>>> oov_vec = model.wv[oov_word] # numpy vector for OOV word
You can perform various NLP word tasks with the model. Some of them are already built-in:
>>> similarities = model.wv.most_similar(positive=['computer', 'human'], negative=['interface'])
>>> most_similar = similarities[0]
>>>
>>> similarities = model.wv.most_similar_cosmul(positive=['computer', 'human'], negative=['interface'])
>>> most_similar = similarities[0]
>>>
>>> not_matching = model.wv.doesnt_match("human computer interface tree".split())
>>>
>>> sim_score = model.wv.similarity('computer', 'human')
Correlation with human opinion on word similarity
>>> from gensim.test.utils import datapath
>>>
>>> similarities = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
And on word analogies
>>> analogies_result = model.wv.accuracy(datapath('questions-words.txt'))
"""
import logging
import struct
import numpy as np
from numpy import ones, vstack, empty, float32 as REAL, sum as np_sum
from gensim.models.word2vec import Word2VecVocab, Word2VecTrainables, train_sg_pair, train_cbow_pair
from gensim.models.keyedvectors import Vocab, FastTextKeyedVectors
from gensim.models.base_any2vec import BaseWordEmbeddingsModel
from gensim.models.utils_any2vec import _compute_ngrams, _ft_hash
from gensim.utils import deprecated, call_on_class_only
logger = logging.getLogger(__name__)
try:
from gensim.models.fasttext_inner import train_batch_sg, train_batch_cbow
from gensim.models.fasttext_inner import FAST_VERSION, MAX_WORDS_IN_BATCH
except ImportError:
# failed... fall back to plain numpy (20-80x slower training than the above)
FAST_VERSION = -1
MAX_WORDS_IN_BATCH = 10000
def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
"""Update CBOW model by training on a sequence of sentences.
Called internally from :meth:`~gensim.models.fasttext.FastText.train`.
Notes
-----
This is the non-optimized, Python version. If you have cython installed, gensim will use the optimized version
from :mod:`gensim.models.fasttext_inner` instead.
Parameters
----------
model : :class:`~gensim.models.fasttext.FastText`
Model instance.
sentences : iterable of list of str
Iterable of the sentences.
alpha : float
Learning rate.
work : :class:`numpy.ndarray`, optional
UNUSED.
neu1 : :class:`numpy.ndarray`, optional
UNUSED.
Returns
-------
int
Effective number of words trained.
"""
result = 0
for sentence in sentences:
word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window)
start = max(0, pos - model.window + reduced_window)
window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
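# gather, for every context word, its vocab index plus the bucket indices of its char n-grams;
# the summed (or averaged) vectors of all of these form the CBOW input layer `l1` below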
vocab_subwords_indices = []
ngrams_subwords_indices = []
for index in word2_indices:
vocab_subwords_indices += [index]
ngrams_subwords_indices.extend(model.wv.buckets_word[index])
l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0) # 1 x vector_size
l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0) # 1 x vector_size
l1 = np_sum([l1_vocab, l1_ngrams], axis=0)
subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices]
if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean:
l1 /= (len(subwords_indices[0]) + len(subwords_indices[1]))
# train on the sliding window for target word
train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True)
result += len(word_vocabs)
return result
def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
"""Update skip-gram model by training on a sequence of sentences.
Called internally from :meth:`~gensim.models.fasttext.FastText.train`.
Notes
-----
This is the non-optimized, Python version. If you have cython installed, gensim will use the optimized version
from :mod:`gensim.models.fasttext_inner` instead.
Parameters
----------
model : :class:`~gensim.models.fasttext.FastText`
`FastText` instance.
sentences : iterable of list of str
Iterable of the sentences directly from disk/network.
alpha : float
Learning rate.
work : :class:`numpy.ndarray`, optional
UNUSED.
neu1 : :class:`numpy.ndarray`, optional
UNUSED.
Returns
-------
int
Effective number of words trained.
"""
result = 0
for sentence in sentences:
word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original word2vec code
# now go over all words from the (reduced) window, predicting each one in turn
start = max(0, pos - model.window + reduced_window)
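# in FastText skip-gram, the center word is represented by its own vocab index
# plus the bucket indices of its char n-grams (collected in `subwords_indices` below)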
subwords_indices = (word.index,)
subwords_indices += model.wv.buckets_word[word.index]
for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
if pos2 != pos: # don't train on the `word` itself
train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True)
result += len(word_vocabs)
return result
FASTTEXT_FILEFORMAT_MAGIC = 793712314
class FastText(BaseWordEmbeddingsModel):
"""Train, use and evaluate word representations learned using the method
described in `Enriching Word Vectors with Subword Information <https://arxiv.org/abs/1607.04606>`_, aka FastText.
The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save` and
:meth:`~gensim.models.fasttext.FastText.load` methods, or loaded from a format compatible with the original
Fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format`.
Some important internal attributes are the following:
Attributes
----------
wv : :class:`~gensim.models.keyedvectors.FastTextKeyedVectors`
This object essentially contains the mapping between words and embeddings. These are similar to the embeddings
computed in the :class:`~gensim.models.word2vec.Word2Vec`, however here we also include vectors for n-grams.
This allows the model to compute embeddings even for **unseen** words (that do not exist in the vocabulary),
as the aggregate of the n-grams included in the word. After training the model, this attribute can be used
directly to query those embeddings in various ways. Check the module level docstring for some examples.
vocabulary : :class:`~gensim.models.fasttext.FastTextVocab`
This object represents the vocabulary of the model.
Besides keeping track of all unique words, this object provides extra functionality, such as
constructing a huffman tree (frequent words are closer to the root), or discarding extremely rare words.
trainables : :class:`~gensim.models.fasttext.FastTextTrainables`
This object represents the inner shallow neural network used to train the embeddings. This is very
similar to the network of the :class:`~gensim.models.word2vec.Word2Vec` model, but it also trains weights
for the character N-Grams (sequences of more than one character). The semantics of the network are almost the same as
in the :class:`~gensim.models.word2vec.Word2Vec` model.
You can think of it as a NN with a single projection and hidden layer, trained on the corpus.
The weights are then used as our embeddings. An important difference between the two models, however, is the
scoring function used to compute the loss. In the case of FastText, it is modified to also account
for the internal structure of words, besides their co-occurrence counts.
"""
def __init__(self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6,
sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=()):
"""
Parameters
----------
sentences : iterable of list of str, optional
Can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it
in some other way.
min_count : int, optional
The model ignores all words with total frequency lower than this.
size : int, optional
Dimensionality of the word vectors.
window : int, optional
The maximum distance between the current and predicted word within a sentence.
workers : int, optional
Use this many worker threads to train the model (faster training with multicore machines).
alpha : float, optional
The initial learning rate.
min_alpha : float, optional
Learning rate will linearly drop to `min_alpha` as training progresses.
sg : {1, 0}, optional
Training algorithm: skip-gram if `sg=1`, otherwise CBOW.
hs : {1,0}, optional
If 1, hierarchical softmax will be used for model training.
If set to 0, and `negative` is non-zero, negative sampling will be used.
seed : int, optional
Seed for the random number generator. Initial vectors for each word are seeded with a hash of
the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
use of the `PYTHONHASHSEED` environment variable to control hash randomization).
max_vocab_size : int, optional
Limits the RAM during vocabulary building; if there are more unique
words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
Set to `None` for no limit.
sample : float, optional
The threshold for configuring which higher-frequency words are randomly downsampled,
useful range is (0, 1e-5).
negative : int, optional
If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
should be drawn (usually between 5-20).
If set to 0, no negative sampling is used.
ns_exponent : float, optional
The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion
to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more
than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper.
More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that
other values may perform better for recommendation applications.
cbow_mean : {1,0}, optional
If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
hashfxn : function, optional
Hash function to use to randomly initialize weights, for increased training reproducibility.
iter : int, optional
Number of iterations (epochs) over the corpus.
trim_rule : function, optional
Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
be trimmed away, or handled using the default (discard if word count < min_count).
Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
or a callable that accepts parameters (word, count, min_count) and returns either
:attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
The rule, if given, is only used to prune vocabulary during
:meth:`~gensim.models.fasttext.FastText.build_vocab` and is not stored as part of the model.
The input parameters are of the following types:
* `word` (str) - the word we are examining
* `count` (int) - the word's frequency count in the corpus
* `min_count` (int) - the minimum count threshold.
sorted_vocab : {1,0}, optional
If 1, sort the vocabulary by descending frequency before assigning word indices.
batch_words : int, optional
Target size (in words) for batches of examples passed to worker threads (and
thus cython routines). (Larger batches will be passed if individual
texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
min_n : int, optional
Minimum length of char n-grams to be used for training word representations.
max_n : int, optional
Max length of char ngrams to be used for training word representations. Set `max_n` to be
less than `min_n` to avoid char ngrams being used.
word_ngrams : {1,0}, optional
If 1, enrich word vectors with subword (character n-gram) information.
If 0, this is equivalent to :class:`~gensim.models.word2vec.Word2Vec`.
bucket : int, optional
Character ngrams are hashed into a fixed number of buckets, in order to limit the
memory usage of the model. This option specifies the number of buckets used by the model.
callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`, optional
List of callbacks that need to be executed/run at specific stages during training.
Examples
--------
Initialize and train a `FastText` model::
>>> from gensim.models import FastText
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>>
>>> model = FastText(sentences, min_count=1)
>>> say_vector = model['say'] # get vector for a word
>>> of_vector = model['of'] # get vector for an out-of-vocab word
"""
self.load = call_on_class_only
self.load_fasttext_format = call_on_class_only
self.callbacks = callbacks
self.word_ngrams = int(word_ngrams)
if self.word_ngrams <= 1 and max_n == 0:
bucket = 0
self.wv = FastTextKeyedVectors(size, min_n, max_n)
self.vocabulary = FastTextVocab(
max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent)
self.trainables = FastTextTrainables(
vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn)
self.wv.bucket = self.bucket
super(FastText, self).__init__(
sentences=sentences, workers=workers, vector_size=size, epochs=iter, callbacks=callbacks,
batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed,
hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, fast_version=FAST_VERSION)
@property
@deprecated("Attribute will be removed in 4.0.0, use wv.min_n instead")
def min_n(self):
return self.wv.min_n
@property
@deprecated("Attribute will be removed in 4.0.0, use wv.max_n instead")
def max_n(self):
return self.wv.max_n
@property
@deprecated("Attribute will be removed in 4.0.0, use trainables.bucket instead")
def bucket(self):
return self.trainables.bucket
@property
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead")
def syn0_vocab_lockf(self):
return self.trainables.vectors_vocab_lockf
@syn0_vocab_lockf.setter
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead")
def syn0_vocab_lockf(self, value):
self.trainables.vectors_vocab_lockf = value
@syn0_vocab_lockf.deleter
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_vocab_lockf instead")
def syn0_vocab_lockf(self):
del self.trainables.vectors_vocab_lockf
@property
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead")
def syn0_ngrams_lockf(self):
return self.trainables.vectors_ngrams_lockf
@syn0_ngrams_lockf.setter
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead")
def syn0_ngrams_lockf(self, value):
self.trainables.vectors_ngrams_lockf = value
@syn0_ngrams_lockf.deleter
@deprecated("Attribute will be removed in 4.0.0, use self.trainables.vectors_ngrams_lockf instead")
def syn0_ngrams_lockf(self):
del self.trainables.vectors_ngrams_lockf
@property
@deprecated("Attribute will be removed in 4.0.0, use self.wv.num_ngram_vectors instead")
def num_ngram_vectors(self):
return self.wv.num_ngram_vectors
def build_vocab(self, sentences, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs):
"""Build vocabulary from a sequence of sentences (can be a once-only generator stream).
Each sentence must be a list of unicode strings.
Parameters
----------
sentences : iterable of list of str
Can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
update : bool
If true, the new words in `sentences` will be added to the model's vocab.
progress_per : int
Indicates how many words to process before showing/updating the progress.
keep_raw_vocab : bool
If not true, delete the raw vocabulary after the scaling is done and free up RAM.
trim_rule : function, optional
Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
be trimmed away, or handled using the default (discard if word count < min_count).
Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
or a callable that accepts parameters (word, count, min_count) and returns either
:attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
The rule, if given, is only used to prune vocabulary during
:meth:`~gensim.models.fasttext.FastText.build_vocab` and is not stored as part of the model.
The input parameters are of the following types:
* `word` (str) - the word we are examining
* `count` (int) - the word's frequency count in the corpus
* `min_count` (int) - the minimum count threshold.
**kwargs
Additional key word parameters passed to
:meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.build_vocab`.
Examples
--------
Train a model and update vocab for online training
>>> from gensim.models import FastText
>>> sentences_1 = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>> sentences_2 = [["dude", "say", "wazzup!"]]
>>>
>>> model = FastText(min_count=1)
>>> model.build_vocab(sentences_1)
>>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.iter)
>>>
>>> model.build_vocab(sentences_2, update=True)
>>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.iter)
"""
if update:
if not len(self.wv.vocab):
raise RuntimeError(
"You cannot do an online vocabulary-update of a model which has no prior vocabulary. "
"First build the vocabulary of your model with a corpus "
"before doing an online update.")
self.vocabulary.old_vocab_len = len(self.wv.vocab)
self.trainables.old_hash2index_len = len(self.wv.hash2index)
return super(FastText, self).build_vocab(
sentences, update=update, progress_per=progress_per,
keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs)
def _set_train_params(self, **kwargs):
pass
def _clear_post_train(self):
"""Clear the model's internal structures after training has finished to free up RAM."""
self.wv.vectors_norm = None
self.wv.vectors_vocab_norm = None
self.wv.vectors_ngrams_norm = None
self.wv.buckets_word = None
def estimate_memory(self, vocab_size=None, report=None):
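"""Estimate the memory needed to train this model, broken down per internal structure (in bytes)."""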
vocab_size = vocab_size or len(self.wv.vocab)
vec_size = self.vector_size * np.dtype(np.float32).itemsize
l1_size = self.layer1_size * np.dtype(np.float32).itemsize
report = report or {}
report['vocab'] = len(self.wv.vocab) * (700 if self.hs else 500)
report['syn0_vocab'] = len(self.wv.vocab) * vec_size
num_buckets = self.bucket
if self.hs:
report['syn1'] = len(self.wv.vocab) * l1_size
if self.negative:
report['syn1neg'] = len(self.wv.vocab) * l1_size
if self.word_ngrams > 0 and self.wv.vocab:
buckets = set()
num_ngrams = 0
for word in self.wv.vocab:
ngrams = _compute_ngrams(word, self.min_n, self.max_n)
num_ngrams += len(ngrams)
buckets.update(_ft_hash(ng) % self.bucket for ng in ngrams)
num_buckets = len(buckets)
report['syn0_ngrams'] = len(buckets) * vec_size
# A tuple (48 bytes) with num_ngrams_word ints (8 bytes) for each word
# Only used during training, not stored with the model
report['buckets_word'] = 48 * len(self.wv.vocab) + 8 * num_ngrams
elif self.word_ngrams > 0:
logger.warning(
'subword information is enabled, but no vocabulary could be found, estimated required memory might be '
'inaccurate!'
)
report['total'] = sum(report.values())
logger.info(
"estimated required memory for %i words, %i buckets and %i dimensions: %i bytes",
len(self.wv.vocab), num_buckets, self.vector_size, report['total']
)
return report
def _do_train_job(self, sentences, alpha, inits):
"""Train a single batch of sentences. Return 2-tuple `(effective word count after
ignoring unknown words and sentence length trimming, total word count)`.
Parameters
----------
sentences : iterable of list of str
Can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
alpha : float
The current learning rate.
inits : tuple of (:class:`numpy.ndarray`, :class:`numpy.ndarray`)
Each worker's private work memory.
Returns
-------
(int, int)
Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count)
"""
work, neu1 = inits
tally = 0
if self.sg:
tally += train_batch_sg(self, sentences, alpha, work, neu1)
else:
tally += train_batch_cbow(self, sentences, alpha, work, neu1)
return tally, self._raw_word_count(sentences)
def train(self, sentences, total_examples=None, total_words=None,
epochs=None, start_alpha=None, end_alpha=None,
word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs):
"""Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
For FastText, each sentence must be a list of unicode strings.
To support linear learning-rate decay from (initial) `alpha` to `min_alpha`, and accurate
progress-percentage logging, either `total_examples` (count of sentences) or `total_words` (count of
raw words in sentences) **MUST** be provided. If `sentences` is the same corpus
that was provided to :meth:`~gensim.models.fasttext.FastText.build_vocab` earlier,
you can simply use `total_examples=self.corpus_count`.
To avoid common mistakes around the model's ability to do multiple training passes itself, an
explicit `epochs` argument **MUST** be provided. In the common and recommended case
where :meth:`~gensim.models.fasttext.FastText.train` is only called once, you can set `epochs=self.iter`.
Parameters
----------
sentences : iterable of iterables
The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
total_examples : int
Count of sentences.
total_words : int
Count of raw words in sentences.
epochs : int
Number of iterations (epochs) over the corpus.
start_alpha : float, optional
Initial learning rate. If supplied, replaces the starting `alpha` from the constructor,
for this one call to :meth:`~gensim.models.fasttext.FastText.train`.
Use only if making multiple calls to :meth:`~gensim.models.fasttext.FastText.train`, when you want to manage
the alpha learning-rate yourself (not recommended).
end_alpha : float, optional
Final learning rate. Drops linearly from `start_alpha`.
If supplied, this replaces the final `min_alpha` from the constructor, for this one call to
:meth:`~gensim.models.fasttext.FastText.train`.
Use only if making multiple calls to :meth:`~gensim.models.fasttext.FastText.train`, when you want to manage
the alpha learning-rate yourself (not recommended).
word_count : int
Count of words already trained. Set this to 0 for the usual
case of training on all words in sentences.
queue_factor : int
Multiplier for size of queue (number of workers * queue_factor).
report_delay : float
Seconds to wait before reporting progress.
callbacks : :obj: `list` of :obj: `~gensim.models.callbacks.CallbackAny2Vec`
List of callbacks that need to be executed/run at specific stages during training.
Examples
--------
>>> from gensim.models import FastText
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>>
>>> model = FastText(min_count=1)
>>> model.build_vocab(sentences)
>>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
"""
super(FastText, self).train(
sentences, total_examples=total_examples, total_words=total_words,
epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks)
self.trainables.get_vocab_word_vecs(self.wv)
def init_sims(self, replace=False):
"""
Precompute L2-normalized vectors.
Parameters
----------
replace : bool
If True, forget the original vectors and only keep the normalized ones to save RAM.
"""
# init_sims() resides in KeyedVectors because it deals with input layer mainly, but because the
# hidden layer is not an attribute of KeyedVectors, it has to be deleted in this class.
# The normalizing of input layer happens inside of KeyedVectors.
if replace and hasattr(self.trainables, 'syn1'):
del self.trainables.syn1
self.wv.init_sims(replace)
def clear_sims(self):
"""Remove all L2-normalized word vectors from the model, to free up memory.
You can recompute them later again using the :meth:`~gensim.models.fasttext.FastText.init_sims` method.
"""
self._clear_post_train()
@deprecated("Method will be removed in 4.0.0, use self.wv.__getitem__() instead")
def __getitem__(self, words):
"""Deprecated. Use self.wv.__getitem__() instead.
Refer to the documentation for :meth:`gensim.models.keyedvectors.KeyedVectors.__getitem__`
"""
return self.wv.__getitem__(words)
@deprecated("Method will be removed in 4.0.0, use self.wv.__contains__() instead")
def __contains__(self, word):
"""Deprecated. Use self.wv.__contains__() instead.
Refer to the documentation for :meth:`gensim.models.keyedvectors.KeyedVectors.__contains__`
"""
return self.wv.__contains__(word)
@classmethod
def load_fasttext_format(cls, model_file, encoding='utf8'):
"""Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.
Notes
------
Due to limitations in the FastText API, you cannot continue training with a model loaded this way.
Parameters
----------
model_file : str
Path to the FastText output files.
FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`
Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
as Gensim requires only the `.bin` file to load the entire fastText model.
encoding : str, optional
Specifies the file encoding.
Returns
-------
:class: `~gensim.models.fasttext.FastText`
The loaded model.
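Examples
--------
Load a model trained with Facebook's fastText implementation (a sketch only; replace
`/path/to/model.bin` with an actual file on disk):
>>> from gensim.models import FastText
>>>
>>> fb_model = FastText.load_fasttext_format('/path/to/model.bin')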
"""
model = cls()
if not model_file.endswith('.bin'):
model_file += '.bin'
model.file_name = model_file
model.load_binary_data(encoding=encoding)
return model
def load_binary_data(self, encoding='utf8'):
"""Load data from a binary file created by Facebook's native FastText.
Parameters
----------
encoding : str, optional
Specifies the encoding.
"""
# TODO use smart_open again when https://github.com/RaRe-Technologies/smart_open/issues/207 will be fixed
with open(self.file_name, 'rb') as f:
self._load_model_params(f)
self._load_dict(f, encoding=encoding)
self._load_vectors(f)
def _load_model_params(self, file_handle):
"""Load model parameters from Facebook's native fasttext file.
Parameters
----------
file_handle : file-like object
Handle to an open file.
"""
magic, version = self.struct_unpack(file_handle, '@2i')
if magic == FASTTEXT_FILEFORMAT_MAGIC: # newer format
self.new_format = True
dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = \
self.struct_unpack(file_handle, '@12i1d')
else: # older format
self.new_format = False
dim = magic
ws = version
epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d')
# Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc)
self.wv.vector_size = dim
self.vector_size = dim
self.window = ws
self.epochs = epoch
self.vocabulary.min_count = min_count
self.negative = neg
self.hs = loss == 1
self.sg = model == 2
self.trainables.bucket = bucket
self.wv.bucket = bucket
self.wv.min_n = minn
self.wv.max_n = maxn
self.vocabulary.sample = t
def _load_dict(self, file_handle, encoding='utf8'):
"""Load a previously saved dictionary from disk, stored in Facebook's native fasttext format.
Parameters
----------
file_handle : file-like object
The opened file handle to the persisted dictionary.
encoding : str
Specifies the encoding.
"""
vocab_size, nwords, nlabels = self.struct_unpack(file_handle, '@3i')
# Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
if nlabels > 0:
raise NotImplementedError("Supervised fastText models are not supported")
logger.info("loading %s words for fastText model from %s", vocab_size, self.file_name)
self.struct_unpack(file_handle, '@1q') # number of tokens
if self.new_format:
pruneidx_size, = self.struct_unpack(file_handle, '@q')
for i in range(vocab_size):
word_bytes = b''
char_byte = file_handle.read(1)
# Read vocab word
while char_byte != b'\x00':
word_bytes += char_byte
char_byte = file_handle.read(1)
word = word_bytes.decode(encoding)
count, _ = self.struct_unpack(file_handle, '@qb')
self.wv.vocab[word] = Vocab(index=i, count=count)
self.wv.index2word.append(word)
assert len(self.wv.vocab) == nwords, (
'mismatch between final vocab size ({} words), '
'and expected number of words ({} words)'.format(len(self.wv.vocab), nwords))
if len(self.wv.vocab) != vocab_size:
# expecting to log this warning only for pretrained french vector, wiki.fr
logger.warning(
"mismatch between final vocab size (%s words), and expected vocab size (%s words)",
len(self.wv.vocab), vocab_size
)
if self.new_format:
for j in range(pruneidx_size):
self.struct_unpack(file_handle, '@2i')
def _load_vectors(self, file_handle):
"""Load word vectors stored in Facebook's native fasttext format from disk.
Parameters
----------
file_handle : file-like object
Open file handle to persisted vectors.
"""
if self.new_format:
self.struct_unpack(file_handle, '@?') # bool quant_input in fasttext.cc
num_vectors, dim = self.struct_unpack(file_handle, '@2q')
# Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc)
assert self.wv.vector_size == dim, (
'mismatch between vector size in model params ({}) and model vectors ({})'
.format(self.wv.vector_size, dim)
)
float_size = struct.calcsize('@f')
if float_size == 4:
dtype = np.dtype(np.float32)
elif float_size == 8:
dtype = np.dtype(np.float64)
self.num_original_vectors = num_vectors
self.wv.vectors_ngrams = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim)
self.wv.vectors_ngrams = self.wv.vectors_ngrams.reshape((num_vectors, dim))
assert self.wv.vectors_ngrams.shape == (
self.trainables.bucket + len(self.wv.vocab), self.wv.vector_size), \
'mismatch between actual weight matrix shape {} and expected shape {}'\
.format(
self.wv.vectors_ngrams.shape, (self.trainables.bucket + len(self.wv.vocab), self.wv.vector_size)
)
self.trainables.init_ngrams_post_load(self.file_name, self.wv)
self._clear_post_train()
def struct_unpack(self, file_handle, fmt):
"""Read a single object from an open file.
Parameters
----------
file_handle : file-like object
Handle to an open file.
fmt : str
Byte format in which the structure is saved.
Returns
-------
tuple
Values unpacked from the file, with types determined by `fmt`.
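Examples
--------
A minimal, self-contained illustration; the buffer below mimics the two native-order
integers (magic number and version) found at the start of a native fastText file:
>>> import io
>>> import struct
>>> from gensim.models import FastText
>>>
>>> model = FastText()  # an uninitialized model is enough to call struct_unpack
>>> handle = io.BytesIO(struct.pack('@2i', 793712314, 12))
>>> model.struct_unpack(handle, '@2i')
(793712314, 12)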
"""
num_bytes = struct.calcsize(fmt)
return struct.unpack(fmt, file_handle.read(num_bytes))
def save(self, *args, **kwargs):
"""Save the Fasttext model. This saved model can be loaded again using
:meth:`~gensim.models.fasttext.FastText.load`, which supports incremental training
and getting vectors for out-of-vocabulary words.
Parameters
----------
fname : str
Store the model to this file.
See Also
--------
:meth:`~gensim.models.fasttext.FastText.load`
Load :class:`~gensim.models.fasttext.FastText` model.
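Examples
--------
A short sketch, mirroring the module-level usage example:
>>> from gensim.test.utils import common_texts, get_tmpfile
>>> from gensim.models import FastText
>>>
>>> model = FastText(common_texts, size=4, window=3, min_count=1, iter=10)
>>> fname = get_tmpfile("fasttext.model")
>>> model.save(fname)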
"""
kwargs['ignore'] = kwargs.get(
'ignore', ['vectors_norm', 'vectors_vocab_norm', 'vectors_ngrams_norm', 'buckets_word'])
super(FastText, self).save(*args, **kwargs)
@classmethod
def load(cls, *args, **kwargs):
"""Load a previously saved `FastText` model.
Parameters
----------
fname : str
Path to the saved file.
Returns
-------
:class:`~gensim.models.fasttext.FastText`
Loaded model.
See Also
--------
:meth:`~gensim.models.fasttext.FastText.save`
Save :class:`~gensim.models.fasttext.FastText` model.
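Examples
--------
Reload a previously persisted model (a sketch; assumes a model was saved to the same
temporary file as in the :meth:`~gensim.models.fasttext.FastText.save` example):
>>> from gensim.test.utils import get_tmpfile
>>> from gensim.models import FastText
>>>
>>> fname = get_tmpfile("fasttext.model")
>>> model = FastText.load(fname)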
"""
try:
model = super(FastText, cls).load(*args, **kwargs)
if not hasattr(model.trainables, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'):
model.trainables.vectors_vocab_lockf = ones(model.wv.vectors_vocab.shape, dtype=REAL)
if not hasattr(model.trainables, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'):
model.trainables.vectors_ngrams_lockf = ones(model.wv.vectors_ngrams.shape, dtype=REAL)
return model
except AttributeError:
logger.info('Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.')
from gensim.models.deprecated.fasttext import load_old_fasttext
return load_old_fasttext(*args, **kwargs)
@deprecated("Method will be removed in 4.0.0, use self.wv.accuracy() instead")
def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True):
most_similar = most_similar or FastTextKeyedVectors.most_similar
return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive)
class FastTextVocab(Word2VecVocab):
"""Vocabulary used by :class:`~gensim.models.fasttext.FastText`."""
def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75):
super(FastTextVocab, self).__init__(
max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent)
def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, trim_rule=None,
min_count=None, sample=None, dry_run=False):
report_values = super(FastTextVocab, self).prepare_vocab(
hs, negative, wv, update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule,
min_count=min_count, sample=sample, dry_run=dry_run)
return report_values
class FastTextTrainables(Word2VecTrainables):
"""Represents the inner shallow neural network used to train :class:`~gensim.models.fasttext.FastText`."""
def __init__(self, vector_size=100, seed=1, hashfxn=hash, bucket=2000000):
super(FastTextTrainables, self).__init__(
vector_size=vector_size, seed=seed, hashfxn=hashfxn)
self.bucket = int(bucket)
def prepare_weights(self, hs, negative, wv, update=False, vocabulary=None):
super(FastTextTrainables, self).prepare_weights(hs, negative, wv, update=update, vocabulary=vocabulary)
self.init_ngrams_weights(wv, update=update, vocabulary=vocabulary)
def init_ngrams_weights(self, wv, update=False, vocabulary=None):
"""Compute ngrams of all words present in vocabulary and stores vectors for only those ngrams.
Vectors for other ngrams are initialized with a random uniform distribution in FastText.
Parameters
----------
update : bool
If True, the new vocab words and their new ngrams word vectors are initialized
with random uniform distribution and updated/added to the existing vocab word and ngram vectors.
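Examples
--------
A rough sketch of the bucketing performed here, using the same helper functions
(the example word and the bucket count of 2000000 are arbitrary):
>>> from gensim.models.utils_any2vec import _compute_ngrams, _ft_hash
>>>
>>> ngrams = _compute_ngrams("night", 3, 6)  # char n-grams of "<night>"
>>> bucket_ids = [_ft_hash(ng) % 2000000 for ng in ngrams]  # hashed row indices, before compaction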
"""
if not update:
wv.vectors_vocab = empty((len(wv.vocab), wv.vector_size), dtype=REAL)
self.vectors_vocab_lockf = ones((len(wv.vocab), wv.vector_size), dtype=REAL)
wv.vectors_ngrams = empty((self.bucket, wv.vector_size), dtype=REAL)
self.vectors_ngrams_lockf = ones((self.bucket, wv.vector_size), dtype=REAL)
wv.hash2index = {}
wv.buckets_word = {}
ngram_indices = []
for word, vocab in wv.vocab.items():
buckets = []
for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
ngram_hash = _ft_hash(ngram) % self.bucket
if ngram_hash not in wv.hash2index:
wv.hash2index[ngram_hash] = len(ngram_indices)
ngram_indices.append(ngram_hash)
buckets.append(wv.hash2index[ngram_hash])
wv.buckets_word[vocab.index] = tuple(buckets)
wv.num_ngram_vectors = len(ngram_indices)
logger.info("Total number of ngrams is %d", wv.num_ngram_vectors)
wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)
self.vectors_ngrams_lockf = self.vectors_ngrams_lockf.take(ngram_indices, axis=0)
self.reset_ngrams_weights(wv)
else:
wv.buckets_word = {}
num_new_ngrams = 0
for word, vocab in wv.vocab.items():
buckets = []
for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
ngram_hash = _ft_hash(ngram) % self.bucket
if ngram_hash not in wv.hash2index:
wv.hash2index[ngram_hash] = num_new_ngrams + self.old_hash2index_len
num_new_ngrams += 1
buckets.append(wv.hash2index[ngram_hash])
wv.buckets_word[vocab.index] = tuple(buckets)
wv.num_ngram_vectors += num_new_ngrams
logger.info("Number of new ngrams is %d", num_new_ngrams)
rand_obj = np.random
rand_obj.seed(self.seed)
new_vocab_rows = rand_obj.uniform(
-1.0 / wv.vector_size, 1.0 / wv.vector_size,
(len(wv.vocab) - vocabulary.old_vocab_len, wv.vector_size)
).astype(REAL)
new_vocab_lockf_rows = ones(
(len(wv.vocab) - vocabulary.old_vocab_len, wv.vector_size), dtype=REAL)
new_ngram_rows = rand_obj.uniform(
-1.0 / wv.vector_size, 1.0 / wv.vector_size,
(len(wv.hash2index) - self.old_hash2index_len, wv.vector_size)
).astype(REAL)
new_ngram_lockf_rows = ones(
(len(wv.hash2index) - self.old_hash2index_len, wv.vector_size), dtype=REAL)
wv.vectors_vocab = vstack([wv.vectors_vocab, new_vocab_rows])
self.vectors_vocab_lockf = vstack([self.vectors_vocab_lockf, new_vocab_lockf_rows])
wv.vectors_ngrams = vstack([wv.vectors_ngrams, new_ngram_rows])
self.vectors_ngrams_lockf = vstack([self.vectors_ngrams_lockf, new_ngram_lockf_rows])
def reset_ngrams_weights(self, wv):
"""Reset all projection weights to an initial (untrained) state,
but keep the existing vocabulary and their ngrams.
"""
rand_obj = np.random
rand_obj.seed(self.seed)
for index in range(len(wv.vocab)):
wv.vectors_vocab[index] = rand_obj.uniform(
-1.0 / wv.vector_size, 1.0 / wv.vector_size, wv.vector_size
).astype(REAL)
for index in range(len(wv.hash2index)):
wv.vectors_ngrams[index] = rand_obj.uniform(
-1.0 / wv.vector_size, 1.0 / wv.vector_size, wv.vector_size
).astype(REAL)
def get_vocab_word_vecs(self, wv):
"""Calculate vectors for words in vocabulary and stores them in `vectors`."""
for w, v in wv.vocab.items():
word_vec = np.copy(wv.vectors_vocab[v.index])
ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
ngram_weights = wv.vectors_ngrams
for ngram in ngrams:
word_vec += ngram_weights[wv.hash2index[_ft_hash(ngram) % self.bucket]]
word_vec /= (len(ngrams) + 1)
wv.vectors[v.index] = word_vec
def init_ngrams_post_load(self, file_name, wv):
"""Compute ngrams of all words present in vocabulary, and store vectors for only those ngrams.
Vectors for other ngrams are initialized with a random uniform distribution in FastText. These
vectors are discarded here to save space.
"""
wv.vectors = np.zeros((len(wv.vocab), wv.vector_size), dtype=REAL)
for w, vocab in wv.vocab.items():
wv.vectors[vocab.index] += np.array(wv.vectors_ngrams[vocab.index])
ngram_indices = []
wv.num_ngram_vectors = 0
for word in wv.vocab.keys():
for ngram in _compute_ngrams(word, wv.min_n, wv.max_n):
ngram_hash = _ft_hash(ngram) % self.bucket
if ngram_hash in wv.hash2index:
continue
wv.hash2index[ngram_hash] = len(ngram_indices)
ngram_indices.append(len(wv.vocab) + ngram_hash)
wv.num_ngram_vectors = len(ngram_indices)
wv.vectors_ngrams = wv.vectors_ngrams.take(ngram_indices, axis=0)
ngram_weights = wv.vectors_ngrams
logger.info(
"loading weights for %s words for fastText model from %s",
len(wv.vocab), file_name
)
for w, vocab in wv.vocab.items():
word_ngrams = _compute_ngrams(w, wv.min_n, wv.max_n)
for word_ngram in word_ngrams:
vec_idx = wv.hash2index[_ft_hash(word_ngram) % self.bucket]
wv.vectors[vocab.index] += np.array(ngram_weights[vec_idx])
wv.vectors[vocab.index] /= (len(word_ngrams) + 1)
logger.info(
"loaded %s weight matrix for fastText model from %s",
wv.vectors.shape, file_name
)